diff --git a/CMakeLists.txt b/CMakeLists.txt
index 586628dc48d2d61d130fb2719c45150fdb41a01e..3c3e319fb7b8bae0523d7d2e1605680d59f193ce 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,10 @@ include(generic)            # simplify cmake module
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-
+option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN"        OFF)
+if (WITH_GPU  AND WITH_XPU)
+    message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
 if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
     message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
diff --git a/Dockerfile b/Dockerfile
index 42a103240e882b2732f14619308cc00f010d20af..b92ac228a8d50da93f8c0bfe2f6af31fc784f2c7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
 
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
+
 RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 
-RUN pip3 --no-cache-dir install coverage                
-RUN pip3.6 --no-cache-dir install coverage             
-RUN pip3.7 --no-cache-dir install coverage            
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
 RUN pip --no-cache-dir install coverage
 
 COPY ./python/requirements.txt /root/
@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure]
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/README.md b/README.md
index 4196811e37f73f84b0327f5cbf1996aaaf7e6dcc..d14d0ef00148140bd931bbc692fbe15bb21a7bf3 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
diff --git a/README_cn.md b/README_cn.md
index 93ad06d20010fcba1ff3382b169cb78328f2a375..e4544a3eff6e55a29fcfa806786e55e0ac41a672 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 
 ```
 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index bb57b42dcc74114312a400a0f6cc95df307de6bb..cf458d97706755e794c5fbb1ba9d3fcb51e9d1ce 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
     add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
 
+if(WITH_XPU)
+    message(STATUS "Compile with XPU!")
+    add_definitions(-DPADDLE_WITH_XPU)
+endif()
+
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 1688d9d98b7a079b32322d03c46cd6d89b717881..b7a93cd9ee2160090c0142d62d96da72e4c58717 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
   if(NOT CUDA_gpu_detect_output)
     message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
     set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+    #Todo: fix Automatic GPU detection failed on windows
+    if(WIN32)
+      set(${out_variable} "61 75" PARENT_SCOPE)
+    endif()
   else()
     set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
   endif()
@@ -202,6 +206,11 @@ if (NOT WIN32) # windows msvc2015 support c++11 natively.
   set(CMAKE_CUDA_STANDARD 11)
 endif(NOT WIN32)
 
+# (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
+# So replace /W[1-4] with /W0
+if (WIN32)
+  string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+endif(WIN32)
 # in cuda9, suppress cuda warning on eigen
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index 4a343b2c6af2ce64d65203ae5955b2d552055198..6f790f1af8e1a03d1101244a4d82045331b44c13 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
 set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
 set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG        1.9.8)
+set(CUB_TAG        1.8.0)
 
 cache_third_party(extern_cub
     REPOSITORY    ${CUB_REPOSITORY}
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 337e326dc166fd844a938ecd936d8c4162a45573..895bc0849a2a3b57e9e7ba2576567032f07fb35b 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -14,13 +14,21 @@
 
 INCLUDE(ExternalProject)
 
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
+
 SET(GLOO_PROJECT       "extern_gloo")
 IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(GLOO_VER "master" CACHE STRING "" FORCE)
   SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
 ENDIF()
+
 MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
 SET(GLOO_SOURCE_DIR    "${THIRD_PARTY_PATH}/gloo")
 SET(GLOO_DOWNLOAD_DIR  "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index b541d73bc6a633d8e6a77ff567d756f3b40bfce9..8a655b2954dea5d6b864616ed2f4d19b167c4be8 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9f3606138defa04f979d8bea348e7bfda181af68..ae870b766fc3349ea53628e14c68ab9a5826213f 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR     ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY     https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG            fb95345126ade4c54f5507e580a5f5da8d30a515)
+SET(MKLDNN_TAG            1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8a927d8e282a03e8a74c0814ee8d9b247451a091
--- /dev/null
+++ b/cmake/external/xpu.cmake
@@ -0,0 +1,54 @@
+if (NOT WITH_XPU)
+    return()
+endif()
+
+INCLUDE(ExternalProject)
+SET(XPU_PROJECT                 "extern_xpu")
+SET(XPU_URL    "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
+SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
+SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
+SET(XPU_API_INC_DIR             "${THIRD_PARTY_PATH}/install/xpu/api/include")
+SET(XPU_RUNTIME_INC_DIR         "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
+SET(XPU_LIB_DIR                 "${THIRD_PARTY_PATH}/install/xpu/lib")
+
+SET(XPU_API_LIB_NAME            "libxpuapi.so")
+SET(XPU_RT_LIB_NAME             "libxpurt.so")
+SET(XPU_SIM_LIB_NAME            "libxpusim.so")
+SET(XPU_API_LIB                 "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+SET(XPU_RT_LIB                  "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+SET(XPU_SIM_LIB                 "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
+
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+
+INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
+INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
+
+FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(XPU)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
+  "        DESTINATION ${XPU_INSTALL_DIR})\n")
+
+ExternalProject_Add(
+    ${XPU_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${XPU_SOURCE_DIR}
+    DOWNLOAD_DIR          ${XPU_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
+                          && tar xvf xpu.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
+set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
+
+# generate a static dummy target to track xpulib dependencies
+# for cc_library(xxx SRCS xxx.c DEPS xpulib)
+generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
+
+TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
+ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 64878693518b686cc208c293c0ad0b410fa26058..9d07a0979d9392c9b2ab78562f8e0ceb8fc5d722 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -232,7 +232,9 @@ if(WIN32)
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-        set(flag_var "${flag_var} /w")
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
     endforeach(flag_var)
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 8842e8e21c6df224bb6341a4f7f526e3d61e92e1..1956e5c39ea2524d8a8e2650eb08f8d58f410b73 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -384,8 +384,12 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    if (APPLE OR WIN32)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
   endif()
 endfunction()
 
@@ -742,9 +746,14 @@ function(py_test TARGET_NAME)
                ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
+    
+    if (APPLE OR WIN32)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+        # No unit test should exceed 2 minutes in Linux.
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
 
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 5a889dbc3143833ff48a972d17efc0aaf63f1810..20f27715e00457a8fe43f5c620e2a005387d7988 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
             SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
             DSTS ${dst_dir} ${dst_dir}/lib)
 
+    if (WITH_CRYPTO)
         set(dst_dir "${DST}/third_party/install/cryptopp")
         copy(${TARGET}
-        SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
+            SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib)
+    endif()
 
     set(dst_dir "${DST}/third_party/install/xxhash")
     copy(${TARGET}
@@ -187,7 +189,7 @@ copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
         DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
 copy(inference_lib_dist
-        SRCS  ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
         DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 
diff --git a/cmake/init.cmake b/cmake/init.cmake
index a33bfdbd412b15ef8d35c18b38ba6e18a1d03b11..7dfe60f9dd8f021facba6925a465cb58bc5de25d 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -1,29 +1,29 @@
 # Attention: cmake will append these flags to compile command automatically.
 # So if you want to add global option, change this file rather than flags.cmake
 
-# default: "-g"
-set(CMAKE_C_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+# NOT WIN32
+# DEBUG:  default: "-g"
+# RELEASE:  default: "-O3 -DNDEBUG"
+# RELWITHDEBINFO: default: "-O2 -g -DNDEBUG"
+# MINSIZEREL: default: "-O2 -g -DNDEBUG"
+
+if(NOT WIN32)
+    set(CMAKE_C_FLAGS_DEBUG "-g")
+    set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+
+    set(CMAKE_CXX_FLAGS_DEBUG "-g")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+endif()
+
+if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+endif()
 
-# default: "-g"
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 
-# default: "-g"
-set(CMAKE_CUDA_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-O1 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index ecf2dbc81762a59d4d826ae8f5dfc0ab48a28910..f60a6dc3f0c89dd345b04ea3a1e213de770e5760 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -8,12 +8,13 @@ function(op_library TARGET)
     set(hip_cu_srcs)
     set(miopen_hip_cc_srcs)
     set(cu_cc_srcs)
+    set(xpu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(cudnn_cu_srcs)
     set(CUDNN_FILE)
     set(mkldnn_cc_srcs)
     set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function layer)
+    set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
@@ -60,6 +61,12 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
             endif()
         endif()
+        if(WITH_XPU)
+            string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+                list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+            endif()
+        endif()
     else()
         foreach(src ${op_library_SRCS})
             if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
                 list(APPEND cu_cc_srcs ${src})
+            elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+                list(APPEND xpu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     else()
-        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
             ${op_common_deps})
     endif()
 
@@ -150,10 +159,11 @@ function(op_library TARGET)
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH hip_cu_srcs hip_cu_srcs_len)
     list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
     endif()
 
+    if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    endif()
     # pybind USE_OP_DEVICE_KERNEL for MKLDNN
     if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
       # Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
 
     file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
     string(REPLACE "_mkldnn" "" OPS "${OPS}")
+    string(REPLACE "_xpu" "" OPS "${OPS}")
     string(REPLACE ".cc" "" OPS "${OPS}")
     list(REMOVE_DUPLICATES OPS)
     list(LENGTH register_operators_DEPS register_operators_DEPS_len)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index be536b2eefbb123d73ec6f8d17c3d22e5aca2cfc..c9442e8f843ac152cac02908799a8d24f5951e58 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -250,6 +250,11 @@ if(WITH_GPU)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
 
+if(WITH_XPU)
+    include(external/xpu)          # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
+
 if(WITH_PSLIB)
     include(external/pslib)          # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
@@ -263,10 +268,6 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
 
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
 
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -274,6 +275,11 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
+
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
     else()
diff --git a/go/paddle/config.go b/go/paddle/config.go
index cea69e716bffada9e5565eacf8ac1af84ae5b930..c4f39fa9c5d627a689c064bbbd2178cd1ae1a929 100644
--- a/go/paddle/config.go
+++ b/go/paddle/config.go
@@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
 	C.PD_EnableMkldnnQuantizer(config.c)
 }
 
+func (config *AnalysisConfig) EnableMkldnnBfloat16() {
+	C.PD_EnableMkldnnBfloat16(config.c)
+}
+
 func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
 	return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
 }
 
+func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
+	return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
+}
 // SetModelBuffer
 // ModelFromMemory
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index ff2d08bb772605e5b214b41037e301ced2e85dcf..9d5c0cc7048f7db539c090d28c6184ac6d72d75a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -119,9 +119,13 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
 
-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
+
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
+
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
@@ -164,23 +168,23 @@ if(WITH_PYTHON)
   if (NOT WIN32)
     add_custom_command(TARGET framework_py_proto POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
           COMMAND copy /Y *.py ${proto_dstpath}
 	  COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
           COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-	  COMMENT "Copy generated python proto into directory paddle/fleet/proto."
+	  COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
@@ -268,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 
 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 0bd6a79b55392e8bfb8f33b0a29b4cf1df0d44dc..5574a55e18c6d9806cb878dc69ec597f81da97d8 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
-      dst_item->ShareDataWith(src_item);
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
     }
   } else {
     dst_item->clear();
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 05b7a16f1594f370cbf73ab7fdb4c98e3bb76024..551d1342edeb335d1cad4782f85ae9f94f8739bd 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -55,9 +55,8 @@ message LarsConfig {
 }
 
 message LambConfig {
-  optional float beta1 = 1 [ default = 0.001 ];
-  optional float beta2 = 2 [ default = 0.999 ];
-  optional float epsilon = 3 [ default = 0.000001 ];
+  optional float lamb_weight_decay = 1 [ default = 0.01 ];
+  repeated string exclude_from_weight_decay = 2;
 }
 
 message BuildStrategy {
@@ -80,7 +79,7 @@ message ExecutionStrategy {
 }
 
 message AsyncConfig {
-  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 k_steps = 1 [ default = -1 ];
   optional int32 max_merge_var_num = 2 [ default = 1 ];
   optional int32 send_queue_size = 3 [ default = 16 ];
   optional bool independent_recv_thread = 4 [ default = false ];
@@ -114,7 +113,9 @@ message DistributedStrategy {
   optional bool fuse_all_reduce_ops = 18 [ default = true ];
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
-  // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
+  optional bool cudnn_exhaustive_search = 21 [ default = true ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index f2421248e33f236b9fa861f22ce4848531cf1791..180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
     return ctx;
   }
 
+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
+
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8e2e1d38a66d1039519bab312f77bef6604d8ec1..f11edb9a41bdcbcb33efc600f1d7d8f70fb76f45 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
         gc.reset(new DefaultStreamGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
       }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
 #endif
+    } else if (platform::is_cpu_place(place_)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
+    }
   }
 
   for (int64_t i = start_op_index; i < end_op_index; ++i) {
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 0d62488bfe67a316f4840107508129c49b36f23c..3eee0a1abbaf04aef2faa9e52c552e89ce84c7de 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -19,6 +19,6 @@ else()
     cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
 
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
 
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h
index 3f932ee226ca85b86c92fef1b30420d782d9bc62..758cde78530d7b334a7100bce6ce32c2869cc066 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.h
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.h
@@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP };
 
 class GlooWrapper {
  public:
+  static std::shared_ptr<GlooWrapper> GetInstance() {
+    static auto s_instance = std::make_shared<GlooWrapper>();
+    return s_instance;
+  }
+
   GlooWrapper() {}
 
   virtual ~GlooWrapper() {}
@@ -153,6 +158,11 @@ class GlooWrapper {
 #endif
   }
 
+  bool IsInitialized() { return is_initialized_; }
+#ifdef PADDLE_WITH_GLOO
+  std::shared_ptr<gloo::Context> GetContext() { return context_; }
+#endif
+
   template <typename T>
   std::vector<T> AllReduce(std::vector<T>& sendbuf,            // NOLINT
                            const std::string& mode = "sum") {  // NOLINT
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 84b5502ff7b369452e7c9988d185450934c78b03..29312370b3448bfe3c04b914ce0748eb1a66cf32 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -115,6 +115,7 @@ message VarType {
     SIZE_T = 19;
     UINT8 = 20;
     INT8 = 21;
+    BF16 = 22;
 
     // Other types that may need additional descriptions
     LOD_TENSOR = 7;
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index ac892443de36cf6d37d56da761fb3d60628a5e4a..f69ada080676cddfa4f31c6cbc450b8eca28b3ac 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
   callback();
 }
 
+#ifdef PADDLE_WITH_XPU
+XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 2212122c03de3416c91fcc46bf510bbc02d4302e..4f7739652822b9047b1798b6bd66261effbe2f49 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
   void ClearCallback(const std::function<void()> &callback) override;
 };
 
+#ifdef PADDLE_WITH_XPU
+class XPUGarbageCollector : public GarbageCollector {
+ public:
+  XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d00e38784c2c0415a59a33fc24d708c253481c21
--- /dev/null
+++ b/paddle/fluid/framework/generator.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <deque>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/framework/generator.h"
+
+namespace paddle {
+namespace framework {
+
+std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
+
+GeneratorState* Generator::GetState() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_.get();
+}
+
+void Generator::SetState(GeneratorState* state_in) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  *this->state_ = *state_in;
+}
+
+uint64_t Generator::GetCurrentSeed() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->current_seed;
+}
+
+uint64_t Generator::Seed() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  uint64_t seed;
+  std::random_device de;
+  seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
+  this->state_->current_seed = seed;
+  std::seed_seq seq({seed});
+  this->state_->cpu_engine.seed(seq);
+
+  return this->state_->current_seed;
+}
+
+void Generator::SetCurrentSeed(uint64_t seed) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  this->state_->current_seed = uint64_t(seed);
+  std::seed_seq seq({seed});
+  this->state_->cpu_engine.seed(seq);
+}
+
+std::mt19937_64& Generator::GetCPUEngine() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->cpu_engine;
+}
+
+void Generator::SetCPUEngine(std::mt19937_64 engine) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  this->state_->cpu_engine = std::mt19937_64(engine);
+}
+
+uint64_t Generator::Random64() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->cpu_engine();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..17870782ba72a3247de734642962ffec48c0c91e
--- /dev/null
+++ b/paddle/fluid/framework/generator.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <deque>
+#include <iostream>  // temp for debug
+#include <memory>
+#include <mutex>  // NOLINT
+#include <random>
+#include <typeinfo>
+#include <utility>
+
+namespace paddle {
+namespace framework {
+
+struct GeneratorState {
+  int64_t device = -1;
+  uint64_t current_seed = 34342423252;
+  std::mt19937_64 cpu_engine;
+};
+
+struct Generator {
+  Generator() {
+    GeneratorState default_gen_state_cpu;
+    default_gen_state_cpu.device = -1;
+    default_gen_state_cpu.current_seed = 34342423252;
+    std::seed_seq seq({34342423252});
+    default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
+    this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
+  }
+  explicit Generator(GeneratorState state_in)
+      : state_{std::make_shared<GeneratorState>(state_in)} {}
+  Generator(const Generator& other)
+      : Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
+
+  // get random state
+  GeneratorState* GetState();
+  // set random state
+  void SetState(GeneratorState* state_in);
+  // get current seed
+  uint64_t GetCurrentSeed();
+  // random a seed and get
+  uint64_t Seed();
+
+  // set seed
+  void SetCurrentSeed(uint64_t seed);
+  // get cpu engine
+  std::mt19937_64& GetCPUEngine();
+  // set cpu engine
+  void SetCPUEngine(std::mt19937_64 engine);
+
+  uint64_t Random64();
+
+  bool is_init_py = false;
+
+  // CPU Generator singleton
+  static std::shared_ptr<Generator> GetInstance() {
+    if (NULL == gen_instance_) {
+      gen_instance_.reset(new paddle::framework::Generator());
+    }
+    return gen_instance_;
+  }
+
+  static std::shared_ptr<Generator> GetInstanceX() {
+    if (NULL == gen_instance_) {
+      gen_instance_.reset(new paddle::framework::Generator());
+    }
+    gen_instance_->is_init_py = true;
+    return gen_instance_;
+  }
+
+ private:
+  static std::shared_ptr<Generator> gen_instance_;
+  std::shared_ptr<GeneratorState> state_;
+  mutable std::mutex mutex;
+
+  Generator(const Generator& other, const std::lock_guard<std::mutex>&)
+      : state_(std::make_shared<GeneratorState>(*(other.state_))) {}
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc
index c258028e25066d14820017edaaa103b39c57158d..316f8f9c7515ee0255a261645caa38a9807c3fc3 100644
--- a/paddle/fluid/framework/io/crypto/cipher.cc
+++ b/paddle/fluid/framework/io/crypto/cipher.cc
@@ -16,9 +16,6 @@
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef ON_INFER
-#include "paddle/fluid/inference/api/paddle_api.h"
-#endif
 namespace paddle {
 namespace framework {
 
@@ -59,7 +56,7 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
 }
 
 }  // namespace framework
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 std::shared_ptr<framework::Cipher> MakeCipher(const std::string& config_file) {
   return framework::CipherFactory::CreateCipher(config_file);
 }
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 60e4ac8cbcfd8cc8f1d14363538fe1e118b953cd..9d3e0806ac79d838765ca5a4bbf61d0f67ab6ed5 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index fcdbcf299c504c00b3027207bc2f4ac019d48ffc..57a9f69ca15af2759874a1e2a0b58399de652693 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };
 
+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 431d3c05f6dd4b44074729716555744773f950e7..55449856d189065388facf3e3ce736f505e976fb 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -68,11 +68,35 @@ static bool HasInput(Node* n, std::string name) {
   return input_names_set.find(name) != input_names_set.end();
 }
 
+static Node* GetInputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* in : n->inputs) {
+    if (in->Name() == name) {
+      return in;
+    }
+  }
+  return nullptr;
+}
+
+static Node* GetOutputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* out : n->outputs) {
+    if (out->Name() == name) {
+      return out;
+    }
+  }
+  return nullptr;
+}
+
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
-  std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
-  std::vector<Node*> intermediate_out_nodes =
-      subgraph->GetIntermediateOutVarNodes();
+  std::unordered_map<Node*, int> var_ids = EncodeVarNodes(subgraph);
+  std::unordered_set<Node*> intermediate_out_vars_set =
+      subgraph->GetIntermediateOutVarNodesSet();
   std::vector<OperationExpression> expressions;
   for (auto* node : subgraph->SortedNodes()) {
     if (node && node->IsOp() && node->Op()) {
@@ -92,11 +116,12 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
         // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
         if ((HasInput(node, name) && op->Input(name).size() >= 1U)) {
           for (size_t i = 0; i < op->Input(name).size(); i++) {
+            Node* input_var = GetInputVar(node, op->Input(name)[i]);
             PADDLE_ENFORCE_NE(
-                var_ids.find(op->Input(name)[i]), var_ids.end(),
+                var_ids.find(input_var), var_ids.end(),
                 platform::errors::InvalidArgument(
                     "Input(%s) of operation %s is not set.", name, op->Type()));
-            input_ids.push_back(var_ids[op->Input(name)[i]]);
+            input_ids.push_back(var_ids[input_var]);
           }
         } else {
           input_ids.push_back(-1);
@@ -106,31 +131,29 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       // Output ids should be set in fixed order, like:
       //  - dx, dy in backward operations
       std::vector<int> output_ids;
+      std::vector<int> intermediate_output_ids;
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
-      std::unordered_map<int, bool> intermediate_state;
 
       for (auto& name : output_names) {
+        Node* output_var = GetOutputVar(node, op->Output(name)[0]);
         PADDLE_ENFORCE_NE(
-            var_ids.find(op->Output(name)[0]), var_ids.end(),
+            var_ids.find(output_var), var_ids.end(),
             platform::errors::InvalidArgument(
                 "Output(%s) of operation %s is not set.", name, op->Type()));
-        output_ids.push_back(var_ids[op->Output(name)[0]]);
-        bool enable_intermediate = false;
-        for (auto* n : intermediate_out_nodes) {
-          if (n->Name() == op->Output(name)[0]) {
-            enable_intermediate = true;
-            break;
-          }
+        output_ids.push_back(var_ids[output_var]);
+        if (!subgraph->SaveIntermediateOut() &&
+            intermediate_out_vars_set.find(output_var) !=
+                intermediate_out_vars_set.end()) {
+          intermediate_output_ids.push_back(var_ids[output_var]);
         }
-        intermediate_state[var_ids[op->Output(name)[0]]] = enable_intermediate;
       }
 
       std::string lhs_type = ExtractDataType(node->outputs);
       std::string rhs_type = ExtractDataType(node->inputs);
       auto expression =
           OperationExpression(node->Name(), input_ids, output_ids, rhs_type,
-                              lhs_type, intermediate_state);
+                              lhs_type, intermediate_output_ids);
       expression.SetAttr(attr);
       expressions.push_back(expression);
     }
@@ -146,17 +169,18 @@ std::string CodeGenerator::Generate(
   // TODO(liuyiqun): Check whether all expressions are elementwise operations.
   std::set<int> input_ids = std::move(DistilInputIds(expressions));
   std::set<int> output_ids = std::move(DistilOutputIds(expressions));
-  std::set<int> intermediate_ids =
+  std::set<int> intermediate_output_ids =
       std::move(DistilIntermediateIds(expressions));
   std::unordered_map<int, std::string> dtypes =
       std::move(DistilDtypes(expressions));
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(input_ids, output_ids,
-                                                intermediate_ids, dtypes));
+  template_var.Add(
+      "parameters",
+      EmitParameters(input_ids, output_ids, intermediate_output_ids, dtypes));
   template_var.Add("compute_body",
                    EmitComputeBody(expressions, input_ids, output_ids,
-                                   intermediate_ids, dtypes));
+                                   intermediate_output_ids, dtypes));
 
   std::set<std::string> all_dtype;
   for (const auto& type : dtypes) {
@@ -204,18 +228,14 @@ std::set<int> CodeGenerator::DistilOutputIds(
 
 std::set<int> CodeGenerator::DistilIntermediateIds(
     const std::vector<OperationExpression>& expressions) {
-  std::set<int> intermediate_ids;
+  std::set<int> intermediate_output_ids;
   // Use std::set to remove the reptead id and get a ordered list.
   for (size_t i = 0; i < expressions.size(); i++) {
-    for (auto id : expressions[i].GetOutputIds()) {
-      auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state.find(id) != intermediate_state.end() &&
-          intermediate_state[id]) {
-        intermediate_ids.insert(id);
-      }
+    for (auto id : expressions[i].GetIntermediateOutputIds()) {
+      intermediate_output_ids.insert(id);
     }
   }
-  return intermediate_ids;
+  return intermediate_output_ids;
 }
 
 std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
@@ -316,26 +336,29 @@ std::string CodeGenerator::EmitComputeBody(
   return load.str() + compute.str() + store.str();
 }
 
-std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
+std::unordered_map<Node*, int> CodeGenerator::EncodeVarNodes(
     SubGraph* subgraph) {
   const auto& input_var_nodes = subgraph->GetInputVarNodes();
-  const auto& output_var_nodes = subgraph->GetOutputVarNodes();
+  // Encode all var nodes, including intermediate output var nodes.
+  const auto& output_var_nodes = subgraph->GetOutputVarNodes(true);
 
   int id = 0;
-  std::unordered_map<std::string, int> var_ids;
+  std::unordered_map<Node*, int> var_ids;
   // Numbering input vars.
   for (auto* in : input_var_nodes) {
-    VLOG(3) << "Encoding input names:" << in->Name() << ", id:" << id;
-    if (var_ids.find(in->Name()) == var_ids.end()) {
-      var_ids[in->Name()] = id++;
+    VLOG(3) << "Encoding input names:" << in->Name() << "(" << in
+            << "), id:" << id;
+    if (var_ids.find(in) == var_ids.end()) {
+      var_ids[in] = id++;
     }
   }
 
   // Encoding output vars.
   for (auto* out : output_var_nodes) {
-    VLOG(3) << "Ecoding output names:" << out->Name() << ", id:" << id;
-    if (var_ids.find(out->Name()) == var_ids.end()) {
-      var_ids[out->Name()] = id++;
+    VLOG(3) << "Ecoding output names:" << out->Name() << "(" << out
+            << "), id:" << id;
+    if (var_ids.find(out) == var_ids.end()) {
+      var_ids[out] = id++;
     }
   }
   return var_ids;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.h b/paddle/fluid/framework/ir/fusion_group/code_generator.h
index 2b18657bbcfbe81d4504306a3753d5c0b82092fd..21773f239b9f6e5208aea45f481bf6f92745033f 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h
@@ -61,7 +61,7 @@ class CodeGenerator {
       const std::unordered_map<int, std::string>& dtypes) const;
 
   // Encode all var nodes in the subgraph with an unique number.
-  std::unordered_map<std::string, int> EncodeVarNodes(SubGraph* subgraph);
+  std::unordered_map<Node*, int> EncodeVarNodes(SubGraph* subgraph);
 
  private:
   std::vector<CodeTemplate> code_templates_;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
index 03d28277afbbb7467f638d521414d702bc8e8179..910f71e65bed10a515f9401a5b09a27ba0929fcf 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
@@ -48,20 +48,20 @@ class OperationExpression {
       std::string op_type, const std::vector<int>& input_ids,
       const std::vector<int>& output_ids, std::string rhs_type,
       std::string lhs_type,
-      const std::unordered_map<int, bool>& intermediate_state = {})
+      const std::vector<int>& intermediate_output_ids = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
         rhs_type_(rhs_type),
         lhs_type_(lhs_type),
-        intermediate_state_(intermediate_state) {}
+        intermediate_output_ids_(intermediate_output_ids) {}
 
   std::string GetOpType() const { return op_type_; }
-  std::unordered_map<int, bool> GetIntermediateState() const {
-    return intermediate_state_;
-  }
   std::vector<int> GetInputIds() const { return input_ids_; }
   std::vector<int> GetOutputIds() const { return output_ids_; }
+  std::vector<int> GetIntermediateOutputIds() const {
+    return intermediate_output_ids_;
+  }
   std::string GetRHSType() const { return rhs_type_; }
   std::string GetLHSType() const { return lhs_type_; }
   void SetAttr(AttributeMap attr) { attr_ = attr; }
@@ -84,7 +84,7 @@ class OperationExpression {
   AttributeMap attr_;
   std::string rhs_type_;
   std::string lhs_type_;
-  std::unordered_map<int, bool> intermediate_state_;
+  std::vector<int> intermediate_output_ids_;
 };
 
 class TemplateVariable {
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 89b05fc577bb46606ff5c43d0dd697bd7b8aed38..ebc89b14c265d3491f0f9bc64a36f52c6c9f2a18 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -144,7 +144,6 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
       LOG(INFO) << "Precision check failed from i = " << id
                 << ", expect: " << expect << ", actual: " << actual;
       EXPECT_LT(fabs(actual - expect), eps);
-      break;
     }
   }
 }
@@ -465,7 +464,7 @@ TEST(code_generator, subgraph) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
                                     graph->Nodes());
 
     // Expressions generated by code_generator (they may be different):
@@ -484,7 +483,7 @@ TEST(code_generator, subgraph_grad) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
                                     DistilGradNodes(graph));
 
     // Expressions generated by code_generator (they may be different):
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
index 5de253bb96743dc18b6394f04d7818a090a114c2..f6262762a2af6e1abec47fca2bce85a74116b5fd 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
@@ -63,7 +63,7 @@ static bool IsEqualAndNotEmpty(const std::vector<int64_t>& l,
 bool GroupDetector::CheckPrecondition(const Node* n) {
   auto check_data_type = [&](const std::vector<Node*>& nodes) -> bool {
     bool is_first = true;
-    proto::VarType::Type data_type_0;
+    proto::VarType::Type data_type_0 = proto::VarType::BOOL;
     for (auto* n : nodes) {
       if (n && n->IsVar() && n->Var()) {
         if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index 883347085926f08adb877d6a7fbe8e5c5e8e1c50..2cf71cdcefcd595c85da63ecb0782d16de5dddb8 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -63,11 +63,6 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
         std::unordered_set<Node*>(vec.begin(), vec.end()));
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
 
-    // In elementwise fused kernel, memory is the bound of execution,
-    // here we remove the output id to use less memory and less time.
-    if (subgraph.RemoveIntermediateOut()) {
-      subgraph.DetectIntermediateOutWithGraph(graph);
-    }
     if (subgraph.IsValid(min_subgraph_size)) {
       subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
@@ -115,57 +110,52 @@ static int ExtractOpRole(fusion_group::SubGraph* subgraph) {
 
 void FusionGroupPass::InsertFusionGroupOp(
     Graph* graph, fusion_group::SubGraph* subgraph) const {
-  const std::vector<Node*>& input_vars_of_subgraph =
-      subgraph->GetInputVarNodes();
-  const std::vector<Node*>& output_vars_of_subgraph =
-      subgraph->GetOutputVarNodes();
-  const std::vector<Node*> intermediate_vars_of_subgraph =
-      subgraph->GetIntermediateOutVarNodes();
+  const std::vector<Node*>& input_vars = subgraph->GetInputVarNodes();
+  const std::vector<Node*>& output_vars =
+      subgraph->GetOutputVarNodes(subgraph->SaveIntermediateOut());
   std::unordered_set<Node*> external_nodes;
 
-  OpDesc op_desc;
-  op_desc.SetType("fusion_group");
-
+  // Prepare inputs.
   std::vector<std::string> input_names;
-  std::vector<std::string> inputs_data_types;
-  for (auto* n : input_vars_of_subgraph) {
-    input_names.push_back(n->Name());
-    inputs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-    external_nodes.insert(n);
+  std::vector<int> input_dtypes;
+  std::unordered_set<Node*> output_vars_set(output_vars.begin(),
+                                            output_vars.end());
+  for (auto* n : input_vars) {
+    // It is not an output var node.
+    if (output_vars_set.find(n) == output_vars_set.end()) {
+      input_names.push_back(n->Name());
+      input_dtypes.push_back(n->Var()->GetDataType());
+      external_nodes.insert(n);
+    }
   }
-  op_desc.SetInput("Inputs", input_names);
 
+  // Prepare outputs.
   std::vector<std::string> output_names;
-  std::vector<std::string> outs_data_types;
-  std::vector<Node*> output_var_without_intermediate;
-  for (auto* n : output_vars_of_subgraph) {
-    auto it_input =
-        find(input_vars_of_subgraph.begin(), input_vars_of_subgraph.end(), n);
-    auto it_intermediate = find(intermediate_vars_of_subgraph.begin(),
-                                intermediate_vars_of_subgraph.end(), n);
-    if (it_intermediate == intermediate_vars_of_subgraph.end() &&
-        it_input == input_vars_of_subgraph.end()) {
-      output_names.push_back(n->Name());
-      outs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-      output_var_without_intermediate.push_back(n);
-    }
+  std::vector<int> output_dtypes;
+  for (auto* n : output_vars) {
+    output_names.push_back(n->Name());
+    output_dtypes.push_back(n->Var()->GetDataType());
     external_nodes.insert(n);
   }
 
+  OpDesc op_desc;
+  op_desc.SetType("fusion_group");
+  op_desc.SetInput("Inputs", input_names);
   op_desc.SetOutput("Outs", output_names);
-  op_desc.SetAttr("inputs_data_type", inputs_data_types);
-  op_desc.SetAttr("outs_data_type", outs_data_types);
+  op_desc.SetAttr("inputs_dtype", input_dtypes);
+  op_desc.SetAttr("outs_dtype", output_dtypes);
   op_desc.SetAttr("type", subgraph->GetType());
   op_desc.SetAttr("func_name", subgraph->GetFuncName());
   op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                   ExtractOpRole(subgraph));
 
   Node* fusion_group_node = graph->CreateOpNode(&op_desc);
-  for (auto* in : input_vars_of_subgraph) {
-    IR_NODE_LINK_TO(in, fusion_group_node);
+  for (auto* in : input_vars) {
+    if (output_vars_set.find(in) == output_vars_set.end()) {
+      IR_NODE_LINK_TO(in, fusion_group_node);
+    }
   }
-
-  for (auto* out : output_var_without_intermediate) {
+  for (auto* out : output_vars) {
     IR_NODE_LINK_TO(fusion_group_node, out);
   }
 
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index b127d132bafb00f32e7e6c5d2681d6f0e78b4c34..921cf0904f632936862b18b2f083f18a33c760be 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -105,12 +105,6 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}",
                  {"${2} * (%{1.0} - ${1} * ${1})"});
 
-  // cast:
-  // out = static_cast<T>(x)
-  // TODO(wangchaochaohu): This is not the compelete definition of
-  // cast Op, We need refine it later.
-  insert_handler("cast", "${0}", {});
-
   // sqrt:
   //  out = x^(1/2)
   //  dx = dout * 0.5 / out
@@ -121,11 +115,21 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   //  dx = dout * 2.0 * x
   insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"});
 
+  // assign:
+  //  out = x
+  insert_handler("assign", "${0}", {});
+
+  // cast:
+  //  out = static_cast<T>(x)
+  // TODO(wangchaochaohu): This is not the compelete definition of
+  //  cast Op, We need refine it later.
+  insert_handler("cast", "${0}", {});
+
   // scale
-  // out = (bias_after_scale) ? scale * X +  bias : scale(X + bias)
-  // here we use '=' operator to seperate th default value
+  //  out = (bias_after_scale) ? scale * X +  bias : scale(X + bias)
+  //  here we use '=' operator to seperate th default value
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
-  // bias.
+  //  bias.
   insert_handler(
       "scale",
       "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + "
diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h
index 66b17e9f6fe95519b7687bc1c5725684c5c98610..5a29e875aea615c36711aa7dc044e4e1f563c297 100644
--- a/paddle/fluid/framework/ir/fusion_group/subgraph.h
+++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h
@@ -66,11 +66,12 @@ class SubGraph {
   }
 
   int GetType() const { return type_; }
-  bool RemoveIntermediateOut() { return !save_intermediate_out_; }
 
   void SetFuncName(std::string func_name) { func_name_ = func_name; }
   std::string GetFuncName() const { return func_name_; }
 
+  bool SaveIntermediateOut() const { return save_intermediate_out_; }
+
   const std::unordered_set<Node*>& Nodes() const { return nodes_set_; }
   const std::vector<Node*>& SortedNodes() {
     if (!is_sorted_) {
@@ -118,66 +119,88 @@ class SubGraph {
     return input_vars;
   }
 
-  std::vector<Node*> GetOutputVarNodes() {
+  std::vector<Node*> GetOutputVarNodes(bool with_intermediate_out) {
     // The order of output nodes should be consistant anywhere..
-    std::vector<Node*> output_vars_all;
+    std::vector<Node*> output_vars;
     for (auto* n : SortedNodes()) {
-      if (n && n->IsVar() && n->Var()) {
+      if (IsOutputOfInternalOp(n)) {
         // If the var_node is the output of some op_node in the subgraph, it
         // is considered the output var node of the subgraph.
-        bool is_found = false;
-        for (auto* in : n->inputs) {
-          if (Has(in)) {
-            is_found = true;
+        if (with_intermediate_out) {
+          output_vars.push_back(n);
+        } else {
+          if (n->outputs.empty() || IsInputOfExternalOp(n)) {
+            output_vars.push_back(n);
           }
         }
-        if (is_found) {
-          output_vars_all.push_back(n);
-        }
       }
     }
-    return output_vars_all;
+    return output_vars;
   }
 
   std::vector<Node*> GetIntermediateOutVarNodes() {
-    return intermediate_out_nodes_;
+    // Intermediate output var nodes: the output of some op_node in the
+    // subgraph, but not referenced outside the subgraph.
+    std::vector<Node*> intermediate_out_vars;
+    for (auto* n : SortedNodes()) {
+      if (IsOutputOfInternalOp(n) && IsInputOfInternalOp(n) &&
+          !IsInputOfExternalOp(n)) {
+        // When the outputs size is 0, it is also considered a intermidiate
+        // output. It maybe an unused output or the fetching vars, so that we
+        // cannot eleiminate it directly here.
+        intermediate_out_vars.push_back(n);
+      }
+    }
+    return intermediate_out_vars;
   }
 
-  void DetectIntermediateOutWithGraph(Graph* graph) {
-    auto graph_nodes = graph->Nodes();
-
-    for (auto* n : SortedNodes()) {
-      bool enable_remove = true;
+  std::unordered_set<Node*> GetIntermediateOutVarNodesSet() {
+    std::vector<Node*> intermediate_out_vars = GetIntermediateOutVarNodes();
+    return std::unordered_set<Node*>(intermediate_out_vars.begin(),
+                                     intermediate_out_vars.end());
+  }
 
-      if (n && n->IsVar() && n->Var()) {
-        bool leaf_graph = true;
-        for (auto* node : graph_nodes) {
-          if (node->IsOp()) {
-            auto inputs = node->inputs;
-            for (auto* in : inputs) {
-              if (in && in->Name() == n->Name()) {
-                if (!Has(node)) enable_remove = false;
-                leaf_graph = false;
-              }
-            }
-          }
-          if (!enable_remove) {
-            break;
-          }
+ private:
+  bool IsInputOfInternalOp(Node* n) {
+    bool is_input_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (Has(out)) {
+          is_input_of_internal_op = true;
+          break;
         }
-        if (leaf_graph) enable_remove = false;
+      }
+    }
+    return is_input_of_internal_op;
+  }
 
-      } else {
-        enable_remove = false;
+  bool IsInputOfExternalOp(Node* n) {
+    // If n is the input any one node outside the subgraph.
+    bool is_input_of_external_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (!Has(out)) {
+          is_input_of_external_op = true;
+          break;
+        }
       }
+    }
+    return is_input_of_external_op;
+  }
 
-      if (enable_remove) {
-        intermediate_out_nodes_.push_back(n);
+  bool IsOutputOfInternalOp(Node* n) {
+    bool is_output_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* in : n->inputs) {
+        if (Has(in)) {
+          is_output_of_internal_op = true;
+          break;
+        }
       }
     }
+    return is_output_of_internal_op;
   }
 
- private:
   void TopologicalSort() {
     if (!is_sorted_) {
       std::unordered_map<Node*, std::vector<Node*>> inputs_map;
@@ -236,7 +259,6 @@ class SubGraph {
   bool save_intermediate_out_{true};
 
   std::unordered_set<Node*> nodes_set_;
-  std::vector<Node*> intermediate_out_nodes_{};
   bool is_sorted_{false};
   std::vector<Node*> sorted_nodes_;
 };
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index ff6dffa704eeceeabfc5eb1d6786f40b2e523e98..3d65fe595373fa98ba237f04134c75d4a60a7242 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1879,6 +1879,19 @@ PDNode *patterns::MultipleQuantize::operator()() {
   return prev_out;
 }
 
+PDNode *patterns::QuantizePlacement::operator()(
+    const std::unordered_set<std::string> &quantize_enabled_op_types) {
+  std::unordered_set<std::string> supported_op_types =
+      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
+                                       "fc", "matmul", "pool2d", "prior_box",
+                                       "relu", "reshape2", "transpose2"});
+  if (!quantize_enabled_op_types.empty()) {
+    supported_op_types = quantize_enabled_op_types;
+  }
+  auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
+  return op;
+}
+
 PDNode *patterns::MKLDNNInPlace::operator()() {
   const std::unordered_set<std::string> &supported_op_types = {
       "abs",
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index e1cce7848dd54b02a540b144ca1088f62eeb52cb..0803265884165bc754489b18d07c0d277a4bd92b 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1120,6 +1120,15 @@ struct MultipleQuantize : public PatternBase {
   PATTERN_DECL_NODE(prev_out);
 };
 
+struct QuantizePlacement : public PatternBase {
+  QuantizePlacement(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "quantize_placement") {}
+  PDNode* operator()(
+      const std::unordered_set<std::string>& quantize_enabled_op_types);
+
+  PATTERN_DECL_NODE(op);
+};
+
 // Pattern used for enforcing inplace computation for in-place computation
 // supporting DNNL ops. softmax, batch_norm and layer_norm
 struct MKLDNNInPlace : public PatternBase {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index 6be4ce566e01e9bcb89a38cbdc2bbd11551a065e..bc268a834780cad843a18a74bb7f50a639db103d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -26,30 +26,33 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
       Get<std::unordered_set<int>>("quantize_excluded_op_ids");
   const auto& op_types_list =
       Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
-                    n->id()) != excluded_ids_list.end())
-        continue;
-      auto* op = n->Op();
-      if (op->HasAttr("mkldnn_data_type") ||
-          op->HasProtoAttr("mkldnn_data_type")) {
-        // use_quantizer is no longer used
-        // assign value for compatibility
-        if (op->GetAttrIfExists<bool>("use_quantizer")) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-        }
-        if (op_types_list.empty()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             op->Type()) != op_types_list.end()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        }
+  Init(name_scope_, graph);
+  GraphPatternDetector gpd;
+  patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(),
+                                                         "quantize_placement"};
+  quantize_placement_pattern(op_types_list);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern);
+
+    if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
+                  op->id()) != excluded_ids_list.end()) {
+      return;
+    }
+
+    if (op->Op()->HasAttr("mkldnn_data_type") ||
+        op->Op()->HasProtoAttr("mkldnn_data_type")) {
+      // use_quantizer is no longer used
+      // assign value for compatibility
+      if (op->Op()->GetAttrIfExists<bool>("use_quantizer")) {
+        op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
       }
+      op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
+      op->Op()->SetAttr("use_quantizer", true);
     }
-  }
+  };
+  gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
index 008a462dc414c04f53315a8f262de15ab8fb7fb5..f3229e59d6ffb97514adb9c871d4fb981fc964e0 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
@@ -15,7 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
 namespace framework {
@@ -23,9 +26,10 @@ namespace ir {
 /*
  * Specifies which operators should be quantized.
  */
-class CPUQuantizePlacementPass : public Pass {
+class CPUQuantizePlacementPass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
+  const std::string name_scope_{"cpu_quantize_placement_pass"};
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 95e321e5b7190499f98c9df3dbef217310abcfcd..761defc25ff5c89b740ccd5adff7d613beccd9d4 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -130,7 +130,7 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
   MainTest({"conv2d"}, {4}, 1);
 }
 
-TEST(QuantizerPlacementPass, excluded_none) {
+TEST(QuantizerPlacementPass, empty_list) {
   // all operators quantized
   MainTest({}, {}, 6);
 }
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 4506c162fa743a3fcb5973a9f0ebd9e8f6cdcd36..56ae02d49ef522fbf243d8dbc62ee319cbba425b 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -81,7 +81,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope,
       if (quantized_op_type == "conv2d" ||
           quantized_op_type == "conv2d_fusion" ||
           quantized_op_type == "depthwise_conv2d" ||
-          quantized_op_type == "fc") {
+          quantized_op_type == "fc" ||
+          quantized_op_type == "conv2d_transpose") {
         op_desc->SetAttr("Input_scale", scale_value);
       } else if (quantized_op_type == "mul") {
         op_desc->SetAttr("X_scale", scale_value);
@@ -111,7 +112,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   std::string input_name = "";
   if (quantized_op_type == "conv2d" ||
       quantized_op_type == "depthwise_conv2d" ||
-      quantized_op_type == "conv2d_fusion") {
+      quantized_op_type == "conv2d_fusion" ||
+      quantized_op_type == "conv2d_transpose") {
     weight_name = "Filter";
     input_name = "Input";
   } else if (quantized_op_type == "mul") {
@@ -122,7 +124,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
     input_name = "Input";
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
-        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
+        "QuantDequantFuse: We only support conv2d, conv2d_fusion, "
+        "conv2d_transpose, fc, mul for "
         "now."));
   }
   const std::string pattern_name = "dequant_fuse";
@@ -192,10 +195,12 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
         scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
     auto w_dims = weight_tensor->dims();
     // If quantized op is fc, weight scale size = 1;
-    // If quantized op is conv, weight scale size = weight dims[0]
+    // If quantized op is conv2d, weight scale size = weight dims[0]
+    // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
     bool valid_scale_size =
         (weight_scale.size() == 1 ||
-         weight_scale.size() == static_cast<size_t>(w_dims[0]));
+         weight_scale.size() == static_cast<size_t>(w_dims[0]) ||
+         weight_scale.size() == static_cast<size_t>(w_dims[1]));
     PADDLE_ENFORCE_EQ(
         valid_scale_size, true,
         platform::errors::InvalidArgument(
@@ -206,8 +211,14 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
       if (weight_scale.size() == 1) {
         quantized_weight_data[j] *= weight_scale[0];
       } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
+        if (quantized_op_type == "conv2d_transpose") {
+          int inner_size = w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *=
+              weight_scale[(j / inner_size) % w_dims[1]];
+        } else {
+          int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *= weight_scale[j / inner_size];
+        }
       }
     }
 
@@ -220,7 +231,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
     new_op_desc.SetType(quantized_op_type);
     new_op_desc.SetAttr("enable_int8", true);
     if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" ||
-        quantized_op_type == "depthwise_conv2d") {
+        quantized_op_type == "depthwise_conv2d" ||
+        quantized_op_type == "conv2d_transpose") {
       new_op_desc.SetInput("Input", {new_input});
       new_op_desc.SetOutput("Output", {new_output});
     } else if (quantized_op_type == "fc") {
@@ -253,7 +265,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   std::unordered_set<std::string> quant_types = {
       "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
   std::unordered_set<std::string> quantized_op_types = {
-      "conv2d", "mul", "depthwise_conv2d", "fc"};
+      "conv2d", "mul", "depthwise_conv2d", "fc", "conv2d_transpose"};
   auto* scope = param_scope();
 
   for (auto& quant_type : quant_types) {
diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc
index 62c91af15da60b9b0a74028afb0aeb689073b524..7979953d7be827ffc944ae939782923504802bbc 100644
--- a/paddle/fluid/framework/ir/subgraph_detector.cc
+++ b/paddle/fluid/framework/ir/subgraph_detector.cc
@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;
 
     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
 
diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h
index d46f8a574c0d956dc0a90bc2741d2cb80313ab7f..4307e51862df572e013431fceaaf89cc1cf6679c 100644
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 66fe71a80a7b0165a0d4afb38c89fc1fdb339190..bccc92e5c4352927f309f3605bb3c8d8dd823bb5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_desc.h"
+
 #include <algorithm>
 #include <functional>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_call_stack.h"
@@ -51,23 +53,62 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   std::vector<std::string> Outputs(const std::string &name) const override;
 
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
+
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+
     std::string input_n = Inputs(in)[i];
     std::string output_n = Outputs(out)[j];
 
-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
 
     auto *in_var = block_.FindVarRecursive(input_n);
     auto *out_var = block_.FindVarRecursive(output_n);
 
-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
+    PADDLE_ENFORCE_EQ(
+        in_var->GetType(), out_var->GetType(),
+        platform::errors::InvalidArgument(
+            "The type of input %s and output %s do not match. The input type "
+            "is %s, output type is %s.",
+            input_n, output_n, DataTypeToString(in_var->GetType()),
+            DataTypeToString(out_var->GetType())));
 
     SetDim(output_n, GetDim(input_n));
   }
@@ -101,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+    PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
@@ -119,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size(),
-                      "Input %s of operator %s only has %d elements.", in,
-                      op_.Type(), Inputs(in).size());
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, input "
+                          "variable %s of operator %s only has %d elements.",
+                          in, op_.Type(), Inputs(in).size()));
     PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
-                      "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(),
-                      i);
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] of operator %s is empty.",
+                          in, i, op_.Type()));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     PADDLE_ENFORCE_NOT_NULL(
-        in_var, "Input %s[%d] of operator %s should not be nullptr.", in,
-        op_.Type(), i);
+        in_var, platform::errors::NotFound(
+                    "The input variable %s[%d] of operator %s is not found.",
+                    in, i, op_.Type()));
     return in_var->GetLoDLevel();
   }
 
   void SetLoDLevel(const std::string &out, int32_t lod_level,
                    size_t j = 0) const override {
     PADDLE_ENFORCE_LT(j, Outputs(out).size(),
-                      "Output %s of operator %s only has %d elements.", out,
-                      op_.Type(), Outputs(out).size());
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, output "
+                          "variable %s of operator %s only has %d elements.",
+                          out, op_.Type(), Outputs(out).size()));
     PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
-                      "Output %s[%d] of operator %s is @EMPTY@", out,
-                      op_.Type(), j);
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] of operator %s is empty.",
+                          out, j, op_.Type()));
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     PADDLE_ENFORCE_NOT_NULL(
-        out_var, "Output %s[%d] of operator %s should not be nullptr.", out,
-        op_.Type(), j);
+        out_var, platform::errors::NotFound(
+                     "The output variable %s[%d] of operator %s is not found.",
+                     out, j, op_.Type()));
     if (lod_level >= 0) {
       out_var->SetLoDLevel(lod_level);
     }
@@ -175,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetInputDim(const std::string &name) const override {
     const std::vector<std::string> &arg_names = Inputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The input(%s) should hold only one element, but now "
+                          "it holds %d elements.",
+                          name, arg_names.size()));
     return this->GetDim(arg_names[0]);
   }
 
@@ -200,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetOutputDim(const std::string &name, const DDim &dim) override {
     auto arg_names = Outputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The iutput(%s) should hold only one element, but "
+                          "now it holds %d elements.",
+                          name, arg_names.size()));
     SetDim(arg_names[0], dim);
   }
 
@@ -227,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   DDim GetDim(const std::string &name) const {
     auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found.", name));
     DDim res;
     try {
       auto shape = var->GetShape();
@@ -253,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims) {
     size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The input variables number(%d) and input dimensions "
+                          "number(%d) do not match.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (names[i] == framework::kEmptyVarName) {
         continue;
@@ -339,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() {
 
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Input %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
 
@@ -360,8 +430,10 @@ void OpDesc::SetInput(const std::string &param_name,
 
 const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound("Output %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
 
@@ -402,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
 
@@ -467,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
         return;
       }
       default:
-        PADDLE_THROW("Wrong attr type %d", attr.type());
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported attribute type (code %d).", attr.type()));
     }
     need_update_ = true;
     return;
@@ -504,7 +578,8 @@ void OpDesc::SetAttrMap(
 
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return it->second;
 }
 
@@ -518,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr(
     }
   }
 
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+  PADDLE_THROW(platform::errors::NotFound(
+      "Attribute %s is not found in proto %s.", name, proto.type()));
 }
 
 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
@@ -532,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
 
 std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   auto blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, it->second);
 
   std::vector<int> ids;
@@ -545,7 +624,10 @@ std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
 
 int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   return BOOST_GET_CONST(BlockDesc *, it->second)->ID();
 }
 
@@ -632,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_longs());
   }
 
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method of SetAttrDescVisitor object for "
+        "`boosst::blank` type."));
+  }
 };
 
 void OpDesc::Flush() {
@@ -666,8 +752,9 @@ void OpDesc::Flush() {
 }
 
 void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is set.");
+  PADDLE_ENFORCE_EQ(Type().empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "CheckAttrs() can not be called before type is set."));
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
@@ -682,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const {
   try {
     VLOG(3) << "CompileTime infer shape on " << Type();
     auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(infer_shape), true,
+        platform::errors::NotFound(
+            "Operator %s's infer_shape is not registered.", this->Type()));
     CompileTimeInferShapeContext ctx(*this, block);
     if (VLOG_IS_ON(10)) {
       std::ostringstream sout;
@@ -733,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Input(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
 
@@ -749,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Output(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
 
@@ -801,7 +890,8 @@ std::vector<std::string> CompileTimeInferShapeContext::Outputs(
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<DDim> res;
   try {
     auto shapes = var->GetShapes();
@@ -823,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 void CompileTimeInferShapeContext::SetRepeatedDims(
     const std::string &name, const std::vector<DDim> &dims) {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<std::vector<int64_t>> dim_vec(dims.size());
   std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
   var->SetShapes(dim_vec);
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 0f842637a58e0897e8b68fe06d1e712ffd20ad97..d8159d6a5c294b85d8d5ab9bbee3b95a5eba793f 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
+#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
+
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
                               customized_name,                     \
                               customized_type_value,               \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
 
+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11b7224e683402264573019e1541c5645a3a7514
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..79b15fc87d0b0a0ade8324710b80af634ff8878f
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <boost/any.hpp>
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace compatible {
+
+struct OpUpdateRecord {
+  enum class Type {
+    kInvalid = 0,
+    kModifyAttr,
+    kNewAttr,
+    kNewInput,
+    kNewOutput,
+    kBugfixWithBehaviorChanged,
+  };
+  Type type_;
+  std::string remark_;
+};
+
+struct ModifyAttr : OpUpdateRecord {
+  ModifyAttr(const std::string& name, const std::string& remark,
+             const boost::any& default_value)
+      : OpUpdateRecord({Type::kModifyAttr, remark}),
+        name_(name),
+        default_value_(default_value) {
+    // TODO(Shixiaowei02): Check the data type with proto::OpDesc.
+  }
+
+ private:
+  std::string name_;
+  boost::any default_value_;
+};
+
+struct NewAttr : OpUpdateRecord {
+  NewAttr(const std::string& name, const std::string& remark,
+          const boost::any& default_value)
+      : OpUpdateRecord({Type::kNewAttr, remark}),
+        name_(name),
+        default_value_(default_value) {}
+
+ private:
+  std::string name_;
+  boost::any default_value_;
+};
+
+struct NewInput : OpUpdateRecord {
+  NewInput(const std::string& name, const std::string& remark)
+      : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {}
+
+ private:
+  std::string name_;
+};
+
+struct NewOutput : OpUpdateRecord {
+  NewOutput(const std::string& name, const std::string& remark)
+      : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {}
+
+ private:
+  std::string name_;
+};
+
+struct BugfixWithBehaviorChanged : OpUpdateRecord {
+  explicit BugfixWithBehaviorChanged(const std::string& remark)
+      : OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {}
+};
+
+class OpVersionDesc {
+ public:
+  OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
+                            boost::any default_value) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::ModifyAttr(name, remark, default_value)));
+    return *this;
+  }
+
+  OpVersionDesc& NewAttr(const std::string& name, const std::string& remark,
+                         boost::any default_value) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewAttr(name, remark, default_value)));
+    return *this;
+  }
+
+  OpVersionDesc& NewInput(const std::string& name, const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewInput(name, remark)));
+    return *this;
+  }
+
+  OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewOutput(name, remark)));
+    return *this;
+  }
+
+  OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::BugfixWithBehaviorChanged(remark)));
+    return *this;
+  }
+
+ private:
+  std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
+};
+
+class OpVersion {
+ public:
+  OpVersion& AddCheckpoint(const std::string& note,
+                           const OpVersionDesc& op_version_desc) {
+    checkpoints_.push_back(Checkpoint({note, op_version_desc}));
+    return *this;
+  }
+
+ private:
+  struct Checkpoint {
+    std::string note_;
+    OpVersionDesc op_version_desc_;
+  };
+  std::vector<Checkpoint> checkpoints_;
+};
+
+class OpVersionRegistrar {
+ public:
+  static OpVersionRegistrar& GetInstance() {
+    static OpVersionRegistrar instance;
+    return instance;
+  }
+  OpVersion& Register(const std::string& op_type) {
+    if (op_version_map_.find(op_type) != op_version_map_.end()) {
+      PADDLE_THROW("'%s' is registered in operator version more than once.",
+                   op_type);
+    }
+    op_version_map_.insert({op_type, OpVersion()});
+    return op_version_map_[op_type];
+  }
+
+ private:
+  std::unordered_map<std::string, OpVersion> op_version_map_;
+
+  OpVersionRegistrar() = default;
+  OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
+};
+
+}  // namespace compatible
+}  // namespace framework
+}  // namespace paddle
+
+#define REGISTER_OP_VERSION(op_type)                                       \
+  static paddle::framework::compatible::OpVersion                          \
+      RegisterOpVersion__##op_type =                                       \
+          paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
+              .Register(#op_type)
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80ad51ad07b5a84cfabb3ace9b478b1f6ea24f95
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace compatible {
+
+TEST(test_operator_version, test_operator_version) {
+  REGISTER_OP_VERSION(test__)
+      .AddCheckpoint(
+          R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC",
+          framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged(
+              "Support the case of axis < 0"))
+      .AddCheckpoint(
+          R"ROC(
+        Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
+      )ROC",
+          framework::compatible::OpVersionDesc()
+              .ModifyAttr("axis",
+                          "Increased from the original one method to two.", -1)
+              .NewAttr("size",
+                       "In order to represent a two-dimensional rectangle, the "
+                       "parameter size is added.",
+                       0))
+      .AddCheckpoint(
+          R"ROC(
+        Add a new attribute [height]
+      )ROC",
+          framework::compatible::OpVersionDesc().NewAttr(
+              "height",
+              "In order to represent a two-dimensional rectangle, the "
+              "parameter height is added.",
+              0))
+      .AddCheckpoint(
+          R"ROC(
+        Add a input [X2] and a output [Y2]
+      )ROC",
+          framework::compatible::OpVersionDesc()
+              .NewInput("X2", "The second input.")
+              .NewOutput("Y2", "The second output."));
+}
+}  // namespace compatible
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9c293bcdb852ff1ab5b1494838ee2c947cd372cc..ca2705f154c4f45dfccd954b23209c71701adce5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/operator.h"
+
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
@@ -20,18 +22,21 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -163,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
       platform::SetDeviceId(dev_id);
+#endif
+    } else if (platform::is_xpu_place(place)) {
+#ifndef PADDLE_WITH_XPU
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Cannot run operator on place %s", place));
+#else
+      auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+      platform::SetXPUDeviceId(dev_id);
 #endif
     }
 
@@ -604,6 +617,29 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
+
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
     auto in_it = ctx_.inputs.find(in);
@@ -1084,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
     expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
     kernel_iter = kernels.find(expected_kernel_key);
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing XPU kernel: " << type_
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
   if (kernel_iter == kernels.end()) {
     PADDLE_THROW("op %s does not have kernel for %s", type_,
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 709f132813c7da23bc2ab77f7cfb586d4d11edbf..ebecbf0498c384a55627e2b5cb31304d098a444c 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -64,9 +64,6 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
-/// Variables with this suffix are the loaded from pre-train model.
-constexpr char kLoadedVarSuffix[] = "@LOADED";
-
 /// RuntimeContext is used to relate input/output names of Operator with
 /// the corresponding variables in name scope.
 /// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 8c6dd628bb9748bb120c1c39841e199659fb53fc..12e0f97f1262ca0f6bf8fc70ab5b482fb0bdd305 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
+  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
+                 platform::errors::Unavailable(
+                     "XPU is not supported in ParallelExecutor"));
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
   member_->use_cuda_ = exec_strategy.use_cuda_;
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index 919378c929185b12826c8b427d0e9a86a382bb2b..274b0ca0d903d4e89c7bceb74bc16581f03bb584 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
       should_run.push_back(true);
     } else {
       should_run.push_back(false);
+      // If the output of an op modifies feed vars, the op should not clip.
+      // For example, in the transformer structure, the third parameter returned
+      // by beam_search op is generally assigned to a feed var. Cutting the
+      // assign op will cause an error.
+      if (parent_block_id != -1) {
+        bool flag = false;
+        for (auto& var : op_desc.outputs()) {
+          for (auto& argu : var.arguments()) {
+            if (feed_var_names.count(argu)) {
+              flag = true;
+            }
+          }
+        }
+        if (flag) {
+          should_run.back() = true;
+        }
+      }
     }
   }
 
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index eb5c241a8372a460483c70e38f962168b1cdbbc0..12fa0c61f8121d475a0cf2aa78e4bb995a01b132 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
   EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
   EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
 }
+
+// If the output of an op modifies feed vars, the op should not clip.
+TEST(Prune, recurrrent_op_2) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::BlockDesc *sub_block = program.AppendBlock(*block);
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+
+  std::vector<std::string> state_var_name(1, "y");
+  AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
+        {{"ex_states", state_var_name},
+         {"states", state_var_name},
+         {"sub_block", sub_block}},
+        block);
+
+  EXPECT_TRUE(sub_block != nullptr);
+  AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
+        f::AttributeMap{}, sub_block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  std::set<std::string> feed_var_names = {"x", "a"};
+
+  f::Prune(*pdesc, feed_var_names, &pruned);
+  EXPECT_EQ(pruned.blocks_size(), 2);
+  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
+  EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
+}
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 7ce8deb7cfc70d39de52e1fd9e5bace969f854e7..8d8a8f01b3f38c82a480bf7204721481586cc860 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -52,7 +53,8 @@ class InferShapeContext {
                              const std::vector<DDim> &dims) = 0;
   virtual void SetReaderDims(const std::string &name,
                              const std::vector<DDim> &dims);
-
+  virtual std::string GetInputNameByIdx(size_t idx) const = 0;
+  virtual std::string GetOutputNameByIdx(size_t idx) const = 0;
   virtual AttrReader Attrs() const = 0;
   virtual std::vector<std::string> Inputs(const std::string &name) const = 0;
   virtual std::vector<std::string> Outputs(const std::string &name) const = 0;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 50637a0c3d3f9c6975578e94e6ddc2c898c926e0..c3626c5c9e0506f12ca77aac5086cb18e272a771 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -54,14 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_xpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_xpu_place(src_place) &&
+             platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
                  size);
-  } else if (platform::is_gpu_place(src_place) &&  // NOLINT
-             platform::is_cpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -71,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -82,8 +112,32 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (platform::is_cuda_pinned_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
+    auto dst_cuda_pinned_place =
+        BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true,
+                      platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from GPU memory to CUDA Pinned memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place,
+                      platform::errors::PreconditionNotMet(
+                          "The source GPU device and current device context do "
+                          "not match. The source GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          src_gpu_place.device, ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size,
+                 stream);
+  }
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cuda_pinned_place =
         BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
@@ -104,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size,
                  stream);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -128,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
       }
     }
-  } else {
+  }
+  else {  // NOLINT
     PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
@@ -174,35 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
+             platform::is_xpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_xpu_place(src_place) &&  // NOLINT
+             platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
                  size);
-  } else if (platform::is_gpu_place(src_place) &&  // NOLINT
-             platform::is_cpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size,
+                 nullptr);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cuda_pinned_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_pinned_place =
         BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
                  nullptr);
-  } else {
+  }
+  else {  // NOLINT
     PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
@@ -241,6 +336,19 @@ class AnyVisitor : public boost::static_visitor<bool> {
   const framework::Tensor& tensor_;
   Predicate predicate_;
 
+  bool GetResultHelper(const framework::Tensor& out,
+                       const platform::Place& place) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto ctx = platform::DeviceContextPool::Instance().Get(place);
+    ctx->Wait();
+    TensorCopy(out, cpu, *ctx, &tmp);
+    ctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
  public:
   AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
       : tensor_(tensor), predicate_(std::move(predicate)) {}
@@ -255,17 +363,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
     return this->GetResult(out, place);
   }
 
+  bool GetResult(const framework::Tensor& out,
+                 const platform::XPUPlace& xpu) const {
+    return GetResultHelper(out, xpu);
+  }
+
   bool GetResult(const framework::Tensor& out,
                  const platform::CUDAPlace& gpu) const {
-    platform::CPUPlace cpu;
-    framework::Tensor tmp;
-    tmp.Resize({1});
-    tmp.mutable_data<bool>(cpu);
-    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
-    gpuctx->Wait();
-    TensorCopy(out, cpu, *gpuctx, &tmp);
-    gpuctx->Wait();
-    return GetResult(tmp, cpu);
+    return GetResultHelper(out, gpu);
   }
 
   bool GetResult(const framework::Tensor& out,
@@ -315,6 +420,61 @@ inline void Any(const framework::Tensor& tensor, Predicate predicate,
   platform::VisitPlace(place, visitor);
 }
 
+template <typename Predicate, typename DevCtx>
+struct AllDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AllDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void apply() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenVector<bool>::Flatten(*out_);
+    o.device(*ctx_.eigen_device()) = predicate_(t);
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AllImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(tensor.type(), AllDTypeVisitor<Predicate, DevCtx>(
+                                   predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+class AllOutVisitor : public boost::static_visitor<> {
+ private:
+  const framework::Tensor& tensor_;
+  mutable framework::Tensor* out_;
+  Predicate predicate_;
+
+ public:
+  AllOutVisitor(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out)
+      : tensor_(tensor), out_(out), predicate_(predicate) {}
+
+  template <typename Place>
+  void operator()(const Place& place) const {
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    out_->Resize(tensor_.dims());
+    out_->mutable_data<bool>(place);
+    AllImpl(predicate_, tensor_, *ctx, out_);
+  }
+};
+
+template <typename Predicate>
+inline void All(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out) {
+  AllOutVisitor<Predicate> visitor(tensor, predicate, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 struct ContainsNANPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -335,6 +495,12 @@ void TensorContainsNAN(const framework::Tensor& tensor,
   Any(tensor, predicate, out);
 }
 
+void TensorContainsNANV2(const framework::Tensor& tensor,
+                         framework::Tensor* out) {
+  ContainsNANPredicate predicate;
+  All(tensor, predicate, out);
+}
+
 struct ContainsInfPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -355,6 +521,12 @@ void TensorContainsInf(const framework::Tensor& tensor,
   Any(tensor, predicate, out);
 }
 
+void TensorContainsInfV2(const framework::Tensor& tensor,
+                         framework::Tensor* out) {
+  ContainsInfPredicate predicate;
+  All(tensor, predicate, out);
+}
+
 // NOTE(dzhwinter):
 // Isfinite need a AllVisitor to loop through all the elements.
 // We choose two cuda call instead of one allvisitor. The AllVisitor
@@ -367,8 +539,8 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
 
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
-static inline void __global__ BothFalse(const T* cmp, T* out) {
-  out[0] = (!cmp[0]) && (!out[0]);
+static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
+  CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
 }
 #endif
 
@@ -383,25 +555,47 @@ struct BothFalseVisitor : public boost::static_visitor<> {
     VisitorImpl(place);
   }
 
+  void VisitorImpl(const platform::XPUPlace& xpu) const {
+    PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+  }
+
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
 #ifdef PADDLE_WITH_CUDA
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
-    BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
-                                                out_->mutable_data<bool>(gpu));
+    constexpr int MAX_BLOCK_DIM = 512;
+    const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
+    int element_num = in_.numel();
+    int block_size = (element_num >= MAX_BLOCK_DIM)
+                         ? MAX_BLOCK_DIM
+                         : (1 << static_cast<int>(std::log2(element_num)));
+    int grid_size = element_num / block_size;
+    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
+    BothFalse<bool><<<grid_size, block_size, 0, ctx->stream()>>>(
+        in_.data<bool>(), out_->mutable_data<bool>(gpu), element_num);
 #endif
   }
 
   void VisitorImpl(const platform::CPUPlace& cpu) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+    int num = in_.numel();
+    const bool* in_ptr = in_.data<bool>();
+    bool* out_ptr = out_->data<bool>();
+    for (int i = 0; i < num; ++i) {
+      bool lhs = !in_ptr[i];
+      bool rhs = !out_ptr[i];
+      out_ptr[i] = lhs && rhs;
+    }
   }
 
   void VisitorImpl(
       const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+    int num = in_.numel();
+    const bool* in_ptr = in_.data<bool>();
+    bool* out_ptr = out_->data<bool>();
+    for (int i = 0; i < num; ++i) {
+      bool lhs = !in_ptr[i];
+      bool rhs = !out_ptr[i];
+      out_ptr[i] = lhs && rhs;
+    }
   }
 };
 
@@ -414,6 +608,15 @@ void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
   platform::VisitPlace(place, visitor);
 }
 
+void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out) {
+  framework::Tensor tmp;
+  TensorContainsInfV2(tensor, &tmp);
+  TensorContainsNANV2(tensor, out);
+  BothFalseVisitor visitor(tmp, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                     const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
@@ -463,6 +666,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "CUDAPlace is not supported when not compiled with CUDA"));
+#endif
+    } else if (platform::is_xpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_XPU
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& xpu_dev_ctx =
+          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     BOOST_GET_CONST(platform::XPUPlace, tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write);
+        xpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "XPUPlace is not supported when not compiled with XPU"));
 #endif
     } else {
       os.write(static_cast<const char*>(data_ptr),
@@ -517,8 +742,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     void* buf;
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
+        platform::is_xpu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -528,8 +754,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CUDAPlace is not supported when not compiled with CUDA"));
+      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "CUDAPlace is not supported when not compiled with CUDA"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "XPUPlace is not supported when not compiled with XPU"));
+      }
 #endif
     } else {
       framework::VisitDataType(
@@ -568,8 +799,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     void* buf;
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
+        platform::is_xpu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -579,8 +811,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CUDAPlace is not supported when not compiled with CUDA"));
+      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "CUDAPlace is not supported when not compiled with CUDA"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "XPUPlace is not supported when not compiled with XPU"));
+      }
 #endif
     } else {
       framework::VisitDataType(
@@ -665,6 +902,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
         reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
   }
 #endif
+#ifdef PADDLE_WITH_XPU
+  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+#endif
 }
 
 template <typename T>
@@ -673,10 +913,20 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
   auto element_num = tensor.numel();
 
   os << "  - data: [";
-  if (element_num > 0) {
-    os << inspect[0];
-    for (int j = 1; j < element_num; ++j) {
-      os << " " << inspect[j];
+  // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly
+  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
+    if (element_num > 0) {
+      os << signed(inspect[0]);
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << signed(inspect[j]);
+      }
+    }
+  } else {
+    if (element_num > 0) {
+      os << inspect[0];
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << inspect[j];
+      }
     }
   }
   os << "]";
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index c71327da64042aed85f1247f3c31de3e66a588ba..fce0142b41d3ae9b2a6fcd4f16d38b0492fbd806 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
                       const platform::DeviceContext& dev_ctx,
                       const size_t& seek, const std::vector<int64_t>& shape);
 
+// store the bool result tensor in out tensor
+void TensorContainsNANV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorContainsInfV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
+
 // convert dlpack's DLTensor to tensor
 void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
 
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 4d602d5c0211e221a99e0e87a3344c5a9c2a0142..3d01e4fe46f10f1c9494026ca1cb21496ed6fe6b 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) 
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
 add_subdirectory(jit)
-
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
+cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c980b014b823e21f117bc6e44037349b06a1fdfd
--- /dev/null
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/amp_auto_cast.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/imperative/variable_wrapper.h"
+
+namespace paddle {
+namespace imperative {
+
+AmpOperators::AmpOperators()
+    : allow_ops_(new std::unordered_set<std::string>()),
+      block_ops_(new std::unordered_set<std::string>()) {}
+AmpOperators::~AmpOperators() {}
+
+AmpOperators& AmpOperators::Instance() {
+  static AmpOperators instance;
+  return instance;
+}
+
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetAllowOps() {
+  return allow_ops_;
+}
+
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetBlockOps() {
+  return block_ops_;
+}
+
+inline std::string GetDtypeStr(
+    const std::shared_ptr<imperative::VarBase>& var) {
+  return framework::DataTypeToString(var->DataType());
+}
+
+inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
+  if (!platform::is_gpu_place(var->Place())) {
+    return false;
+  }
+  if (var->DataType() == framework::proto::VarType::FP32 ||
+      var->DataType() == framework::proto::VarType::FP16) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad
+// var will be cast back from fp16 to fp32 during backward phase.
+static inline std::shared_ptr<imperative::VarBase> CastToType(
+    const std::shared_ptr<VarBase>& var,
+    const framework::proto::VarType::Type dst_type) {
+  const auto& tracer = imperative::GetCurrentTracer();
+  imperative::NameVarBaseMap ins = {{"X", {var}}};
+  framework::AttributeMap attrs = {{"in_dtype", var->DataType()},
+                                   {"out_dtype", dst_type}};
+  auto out = std::shared_ptr<imperative::VarBase>(
+      new imperative::VarBase(tracer->GenerateUniqueName()));
+  imperative::NameVarBaseMap outs = {{"Out", {out}}};
+
+  {
+    AutoCastGuard guard(tracer, false);
+    tracer->TraceOp("cast", ins, outs, std::move(attrs));
+  }
+
+  return out;
+}
+
+static inline std::shared_ptr<imperative::VarBase> CastToFP16(
+    const std::shared_ptr<VarBase>& var) {
+  auto dst_type = framework::proto::VarType::FP16;
+  if (NeedCast(var) && (var->DataType() != dst_type)) {
+    return CastToType(var, dst_type);
+  }
+  return var;
+}
+
+static inline std::shared_ptr<imperative::VarBase> CastToFP32(
+    const std::shared_ptr<VarBase>& var) {
+  auto dst_type = framework::proto::VarType::FP32;
+  if (NeedCast(var) && (var->DataType() != dst_type)) {
+    return CastToType(var, dst_type);
+  }
+  return var;
+}
+
+static inline framework::proto::VarType::Type GetPromoteType(
+    const NameVarBaseMap& ins) {
+  auto dst_type = framework::proto::VarType::FP16;
+  for (const auto& pair : ins) {
+    for (const auto& var : pair.second) {
+      if (var->DataType() == framework::proto::VarType::FP32) {
+        dst_type = var->DataType();
+        break;
+      }
+    }
+  }
+  return dst_type;
+}
+
+NameVarBaseMap AutoCastInputs(const std::string& op_type,
+                              const NameVarBaseMap& ins) {
+  NameVarBaseMap new_ins = {};
+  if (AmpOperators::Instance().GetAllowOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to float16";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  } else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to float";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP32(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  } else {
+    auto dst_type = GetPromoteType(ins);
+
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to "
+              << framework::DataTypeToString(dst_type);
+      for (const auto& var : pair.second) {
+        // NOTE(zhiqiu): Conv + BN always occur together, we needn't
+        // cast X of batch_norm to FP32, which is produced by conv as FP16 type.
+        if (op_type == "batch_norm" && pair.first == "X" &&
+            dst_type == framework::proto::VarType::FP32) {
+          new_ins[pair.first].emplace_back(var);
+          continue;
+        }
+        auto new_var = dst_type == framework::proto::VarType::FP32
+                           ? CastToFP32(var)
+                           : CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  }
+  return ins;
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1da97e5a39057aed3ed0b4a450bd4a4f5c06984
--- /dev/null
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/imperative/type_defs.h"
+
+namespace paddle {
+namespace imperative {
+
+// Singleton implementation with C++ 11
+class AmpOperators {
+ public:
+  ~AmpOperators();
+  AmpOperators(const AmpOperators& o) = delete;
+  const AmpOperators& operator=(const AmpOperators& o) = delete;
+
+  static AmpOperators& Instance();
+
+  std::shared_ptr<std::unordered_set<std::string>> GetAllowOps();
+
+  std::shared_ptr<std::unordered_set<std::string>> GetBlockOps();
+
+ private:
+  AmpOperators();  // forbid calling default constructor
+
+  // The set of ops that support fp16 calculation and are considered numerically
+  // safe and performance critical. These ops are always converted to fp16.
+  std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
+
+  // The set of ops that support fp16 calculation and are considered numerically
+  // dangerous and whose effects may also be observed in downstream ops.
+  std::shared_ptr<std::unordered_set<std::string>> block_ops_;
+};
+
+// NOTE(zhiqiu): AutoCastGuard is used for RAII.
+class AutoCastGuard {
+ public:
+  AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
+      : tracer_(tracer) {
+    pre_mode_ = tracer_->IsAutoCastEnabled();
+    if (pre_mode_ != guard_mode) {
+      tracer_->SetEnableAutoCast(guard_mode);
+    }
+  }
+
+  ~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
+
+  // forbid copy and operator=
+  AutoCastGuard(const AutoCastGuard& guard) = delete;
+  AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
+
+ private:
+  std::shared_ptr<Tracer> tracer_;
+  bool pre_mode_;
+};
+
+NameVarBaseMap AutoCastInputs(const std::string& op_type,
+                              const NameVarBaseMap& ins);
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index de1246883f1019bc3e6adabadbc9e071926eb772..a91f14e56b719515bfd4d07896648e596a2282dd 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -30,12 +30,13 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
-                       bool retain_graph) {
-  backward_strategy_ = strategy;
+void BasicEngine::Init(VarBase* var, bool retain_graph) {
+  sorted_sum_gradient_ = FLAGS_sort_sum_gradient;
   retain_graph_ = retain_graph;
   init_node_ = var->GradVarBase()->GradNode();
   var->GradVarBase()->ClearGradNode();
@@ -105,7 +106,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
 
       auto& accumulator = accumulators_[var.get()];
       if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
+        if (sorted_sum_gradient_) {
           accumulator.reset(new SortedGradientAccumulator(var.get()));
         } else {
           accumulator.reset(new EagerGradientAccumulator(var.get()));
diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
index 4d25d81235098cca37491b1d8e43b481adc2fd0a..d1aa69f16868d3bcc67458330594dd149564c0bf 100644
--- a/paddle/fluid/imperative/basic_engine.h
+++ b/paddle/fluid/imperative/basic_engine.h
@@ -18,7 +18,6 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 
@@ -30,8 +29,7 @@ class OpBase;
 
 class BasicEngine : public Engine {
  public:
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy,
-            bool retain_graph = false);
+  void Init(VarBase* var, bool retain_graph = false);
 
   void Execute() override;
 
@@ -46,7 +44,7 @@ class BasicEngine : public Engine {
 
  private:
   std::shared_ptr<GradOpNode> init_node_;
-  detail::BackwardStrategy backward_strategy_;
+  bool sorted_sum_gradient_;
   std::unordered_map<GradOpNode*, size_t> node_deps_;
   std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
       accumulators_;
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index f5fc5944709fc94ef23b878a5f58c9cb1dfed63a..7caeb4378ce3d1ca1d1557054642c9fa184bea39 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
     blas.AXPY(numel_, 1., x_, y_);
   }
 
+  void operator()(const platform::XPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+
 #ifdef PADDLE_WITH_CUDA
   void operator()(const platform::CUDAPlace& place) {
     platform::CUDADeviceContext* ctx =
diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h
index 65ac570bc7aa07a1a06e9deffcf797d6ef5d2519..fcd4545a2c82d3c64f8d8d8683438aaf0e6a2719 100644
--- a/paddle/fluid/imperative/infer_shape_context.h
+++ b/paddle/fluid/imperative/infer_shape_context.h
@@ -16,7 +16,9 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
@@ -32,8 +34,12 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
  public:
   DygraphInferShapeContext(const NameVarMap<VarType>* in,
                            const NameVarMap<VarType>* out,
-                           const framework::AttributeMap* attr)
-      : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr) {}
+                           const framework::AttributeMap* attr,
+                           const std::string op_type)
+      : var_base_map_in_(in),
+        var_base_map_out_(out),
+        attrs_(attr),
+        op_type_(op_type) {}
 
   bool HasInput(const std::string& name) const override {
     // has only one input
@@ -135,6 +141,28 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
 
     return vec_res;
   }
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_type_, idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_type_, idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
 
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
@@ -367,6 +395,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
   const NameVarMap<VarType>* var_base_map_in_;
   const NameVarMap<VarType>* var_base_map_out_;
   const framework::AttributeMap* attrs_;
+  const std::string op_type_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 17a0b5e0431d44176b896fa1b5df4f88cadafe9f..03e83301d44a35dc98e9a1aee0e1b22ef2380d50 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -186,6 +186,8 @@ class VarBase {
 
   framework::proto::VarType::Type DataType() const { return var_->DataType(); }
 
+  const platform::Place Place() const { return var_->Place(); }
+
   void ClearGradient();
 
   std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 0b45c189dd714adedc1fb1600e2b350c3dedb62b..3afe5af7f6348654c4cad3d44952cef43ba93f7e 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -33,6 +33,8 @@
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
@@ -529,8 +531,7 @@ class PartialGradTask {
                   const std::vector<std::shared_ptr<VarBase>> &output_targets,
                   const std::vector<std::shared_ptr<VarBase>> &output_grads,
                   const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                  const platform::Place &place,
-                  const detail::BackwardStrategy &strategy, bool create_graph,
+                  const platform::Place &place, bool create_graph,
                   bool retain_graph, bool allow_unused, bool only_inputs);
 
   std::vector<std::shared_ptr<VarBase>> Run();
@@ -577,7 +578,7 @@ class PartialGradTask {
   bool retain_graph_;
   bool allow_unused_;
   bool only_inputs_;
-  detail::BackwardStrategy strategy_;
+  bool sorted_sum_gradient_{FLAGS_sort_sum_gradient};
 };
 
 PartialGradTask::PartialGradTask(
@@ -585,15 +586,14 @@ PartialGradTask::PartialGradTask(
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs) {
   input_targets_ = input_targets;
   place_ = place;
   create_graph_ = create_graph;
   retain_graph_ = retain_graph;
   allow_unused_ = allow_unused;
   only_inputs_ = only_inputs;
-  strategy_ = strategy;
 
   PADDLE_ENFORCE_EQ(only_inputs_, true,
                     platform::errors::Unimplemented(
@@ -887,7 +887,10 @@ void PartialGradTask::RunEachOp(OpBase *op) {
                                              op->Attrs(), op->place());
     PADDLE_ENFORCE_NOT_NULL(
         double_grad_node,
-        platform::errors::NotFound("The Op %s doesn't have any grad op.",
+        platform::errors::NotFound("The Op %s doesn't have any grad op. If you "
+                                   "don't intend calculating higher order "
+                                   "derivatives, please set `create_graph` to "
+                                   "False.",
                                    op->Type()));
     VLOG(10) << "Create " << double_grad_node->size()
              << " double grad op(s) for " << op->Type()
@@ -978,7 +981,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
 
       if (!accumulator) {
         accumulator.reset(new GradientAccumulationInfo(
-            var, strategy_.sorted_sum_gradient_, create_graph_));
+            var, sorted_sum_gradient_, create_graph_));
       }
 
       accumulator->IncreaseTotalRefCnt();
@@ -1030,11 +1033,11 @@ PartialGradEngine::PartialGradEngine(
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs)
     : task_(new PartialGradTask(input_targets, output_targets, output_grads,
-                                no_grad_vars, place, strategy, create_graph,
-                                retain_graph, allow_unused, only_inputs)) {}
+                                no_grad_vars, place, create_graph, retain_graph,
+                                allow_unused, only_inputs)) {}
 
 PartialGradEngine::~PartialGradEngine() { Clear(); }
 
diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h
index a7f28c49ec3950674cd43127f51934089a497412..b5da39f8d4237130fd4674eacb479aaf6b9ba348 100644
--- a/paddle/fluid/imperative/partial_grad_engine.h
+++ b/paddle/fluid/imperative/partial_grad_engine.h
@@ -16,7 +16,6 @@
 
 #include <memory>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
                     const std::vector<std::shared_ptr<VarBase>> &output_targets,
                     const std::vector<std::shared_ptr<VarBase>> &output_grads,
                     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                    const platform::Place &place,
-                    const detail::BackwardStrategy &strategy, bool create_graph,
+                    const platform::Place &place, bool create_graph,
                     bool retain_graph, bool allow_unused, bool only_inputs);
 
   ~PartialGradEngine();
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index def5c860449214ad4a08fd69ff575b91d6f162a0..4e0e95dd012976c292b4511e9707802c210dc599 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/prepared_operator.h"
+
 #include <sstream>
+
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #include "paddle/fluid/imperative/infer_var_type_context.h"
@@ -40,23 +42,17 @@ static void PrepareData(const platform::Place& place,
     for (const auto& var_base : name_pair.second) {
       const auto* tensor = GetTensorFromVar(var_base->Var());
       if (tensor && tensor->IsInitialized()) {
-        auto tmp_place = tensor->place();
-
-        // TODO(jiabin): Support transform data layout when we Verify it on more
-        // tests
-        if (!(tmp_place == place)) {
-          auto kernel_type_for_var = op.GetKernelTypeForVar(
-              name_pair.first, *tensor, expected_kernel_key);
-          if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-            continue;
-          } else {
-            VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            framework::Tensor out;
-            TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-                          &out);
-            SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-          }
+        auto kernel_type_for_var = op.GetKernelTypeForVar(
+            name_pair.first, *tensor, expected_kernel_key);
+        if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
+          continue;
+        } else {
+          VLOG(3) << "Transform Variable " << var_base->Name() << " from "
+                  << kernel_type_for_var << " to " << expected_kernel_key;
+          framework::Tensor out;
+          TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
+                        &out);
+          SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
         }
       }
     }
@@ -91,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
   auto& kernels = kernels_iter->second;
 
   framework::RuntimeContext ctx({}, {});
+#ifdef PADDLE_WITH_MKLDNN
+  // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
+  // GetKernelType functions, so we need to copy the attributes there.
+  // Const qualifier of Attrs had to be discarded to overwrite it.
+  auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
+  mutable_op_attrs = attrs;
+#endif
   auto expected_kernel_key =
       op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
           op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
   // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
   PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
                     platform::errors::NotFound(
@@ -137,7 +147,8 @@ static void PreparedOpRunImpl(
   // TODO(zjl): remove scope in dygraph
   framework::Scope scope;
 
-  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs);
+  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
+                                                    op.Type());
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc
index a231e16100b9f6b153beffe7c66de6fc6813414e..4a30ffb7e3d01ffa90a42278e2e5ef5271045d8a 100644
--- a/paddle/fluid/imperative/tests/test_layer.cc
+++ b/paddle/fluid/imperative/tests/test_layer.cc
@@ -17,9 +17,11 @@
 //
 
 #include <paddle/fluid/framework/op_registry.h>
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
@@ -384,7 +386,7 @@ TEST(test_layer, test_dygraph_infershape_context) {
   concat_att_map["axis"] = 1;
 
   DygraphInferShapeContext<imperative::VarBase> infer_shape_ctx(
-      &ins, &outs, &concat_att_map);
+      &ins, &outs, &concat_att_map, "dummy");
 
   bool have_x = infer_shape_ctx.HasOutputs("Out");
   ASSERT_EQ(have_x, true);
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
index c2e30b45a7f6c06ee6eb8945922a4317e9060491..f226c63f0c432e3878c7df6a5a04433ce047ff26 100644
--- a/paddle/fluid/imperative/tests/test_prepare_op.cc
+++ b/paddle/fluid/imperative/tests/test_prepare_op.cc
@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) {
 }
 #endif
 
-TEST(test_prepare_op, test_prepare_data_same_place) {
+void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
   std::shared_ptr<imperative::VarBase> vin(
       new imperative::VarBase(false, "vin"));
   std::shared_ptr<imperative::VarBase> vout(
@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
   var_pair out_pair = var_pair("Out", vb_vector(1, vout));
   imperative::NameVarBaseMap ins = {x_pair};
   imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap attr_map;
   const std::string op_type = "relu";
   const auto& info = framework::OpInfoMap::Instance().Get(op_type);
   if (info.Checker()) info.Checker()->Check(&attr_map);
@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
     }
   }
 }
+
+TEST(test_prepare_op, test_prepare_data_same_place) {
+  TestPrepareDataSamePlace({});
+}
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
+  TestPrepareDataSamePlace({{"use_mkldnn", true}});
+}
+#endif
 }  // namespace imperative
 }  // namespace paddle
 
 USE_OP(split);
 USE_OP(relu);
+#ifdef PADDLE_WITH_MKLDNN
+USE_OP_DEVICE_KERNEL(relu, MKLDNN);
+#endif
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index 3c3ec2e6263396881597649d3ab643b5492d630a..892acffb712d9734e525a403881fda47ca0df23a 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
   framework::AttributeMap reduce_attr_map;
   tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                  gpu_place, true);
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get(), back_st);
+  engine.Init(reduce_sum_out.get());
   engine.Execute();
 
   framework::LoDTensor rlt;
@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
   ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
   ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
 
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(vout.get(), back_st);
+  engine.Init(vout.get());
   engine.Execute();
 
   // check the grad
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index ee4c5617397b39d6847fecd1c884af8b0e14440f..d09cb03360363088bb021285af4574ffbbb81ef0 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -16,6 +16,7 @@
 #include <unordered_set>
 #include <utility>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
@@ -53,8 +54,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
     attr_checker->Check(&attrs, true);
   }
 
+  NameVarBaseMap new_ins = ins;
+  if (enable_autocast_) {
+    VLOG(5) << "Auto mixed precision run operator: " << type;
+    new_ins = AutoCastInputs(type, ins);
+  }
+
   try {
-    OpBase::Run(*op, ins, outs, attrs, place);
+    OpBase::Run(*op, new_ins, outs, attrs, place);
   } catch (platform::EnforceNotMet& exception) {
     framework::AppendErrorOpHint(type, &exception);
     throw std::move(exception);
@@ -73,11 +80,11 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
 
   if (enable_program_desc_tracing_) {
     VLOG(5) << "Trace op " << type << " into ProgramDesc";
-    program_desc_tracer_->InsertOp(type, ins, outs, attrs);
+    program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
   }
 
-  if (ComputeRequiredGrad(ins, outs, trace_backward)) {
-    CreateGradOpNode(*op, ins, outs, attrs, place);
+  if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
+    CreateGradOpNode(*op, new_ins, outs, attrs, place);
   } else {
     VLOG(3) << "No Grad to track for Op: " << type;
   }
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 7652b3aa291ac0063fcc411b5f86f6084f01e8ef..71996b3e1ac998be2c4cd3765591b640765089a0 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -97,6 +97,10 @@ class Tracer {
 
   void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
 
+  void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
+
+  bool IsAutoCastEnabled() const { return enable_autocast_; }
+
  private:
   std::unique_ptr<BasicEngine> basic_engine_;
   std::unique_ptr<jit::ProgramDescTracer> program_desc_tracer_;
@@ -104,6 +108,7 @@ class Tracer {
   std::unique_ptr<UniqueNameGenerator> generator_;
   platform::Place expected_place_;
   bool has_grad_{true};
+  bool enable_autocast_{false};
 };
 
 // To access static variable current_tracer
diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h
index 9c2ff39e8675fbe1ca3777731a5d9408bfc765b3..d730ddc12d1053910a36b8491c2ce983f60b3648 100644
--- a/paddle/fluid/imperative/variable_wrapper.h
+++ b/paddle/fluid/imperative/variable_wrapper.h
@@ -111,6 +111,28 @@ class VariableWrapper {
     }
   }
 
+  const platform::Place Place() const {
+    const framework::Tensor* tensor = nullptr;
+    auto place =
+        platform::CPUPlace();  // Default place for var not initialized.
+    if (var_.IsInitialized()) {
+      if (type_ == framework::proto::VarType::LOD_TENSOR) {
+        tensor = &(var_.Get<framework::LoDTensor>());
+      } else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
+        tensor = &(var_.Get<framework::SelectedRows>().value());
+      } else {
+        VLOG(6) << "Variable " << name_ << " is not initialized";
+        return place;
+      }
+    }
+    if (tensor && tensor->IsInitialized()) {
+      return tensor->place();
+    } else {
+      VLOG(6) << "The tensor of variable " << name_ << " is not initialized";
+      return place;
+    }
+  }
+
  private:
   void SetGradVar(const std::shared_ptr<VariableWrapper>& var) {
     auto shared_var = grad_var_.lock();
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 1f2734eece578f7ec266a6f31cd46b373f010fc1..98554ed04976670c1a846cbeab69815417c0a998 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32)
     SRCS analyzer_tester.cc
     EXTRA_DEPS reset_tensor_array paddle_fluid_shared
     ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
-elseif(NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
-  inference_analysis_test(test_analyzer
-    SRCS analyzer_tester.cc
-    EXTRA_DEPS reset_tensor_array paddle_inference_api
-    ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
+elseif(WIN32)
+    inference_analysis_test(test_analyzer
+      SRCS analyzer_tester.cc
+      EXTRA_DEPS reset_tensor_array paddle_inference_api
+      ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
 endif()
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
index 70adb4a974cc5f9911cb302840bbef7ec2591505..9a53ce53ab6a756af666de99c8729bf3da2e4a09 100644
--- a/paddle/fluid/inference/analysis/README.md
+++ b/paddle/fluid/inference/analysis/README.md
@@ -6,13 +6,13 @@ and make the various optimization features be pluggable and co-exist in a pipeli
 
 We borrowed some concepts from LLVM, such as
 
-- [Pass](./pass.h)es to implement optimization that traverse the inference program,
-- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
-- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+- [Pass](../../framework/ir/pass.h)es to implement optimization that traverse the inference program,
+- [Graph](../../framework/ir/graph.h) to represent the data flow graph built from a program,
+- [PassManager](./ir_pass_manager.h) to manage a sequence of `Pass`es over a graph.
 
 There are some other basic concepts here
 
-- [Node](./node.h), the node in a `DataFlowGraph`,
+- [Node](../../framework/ir/node.h), the node in a `Graph`,
   - `Function`, the Operator in Fluid,
   - `Value`, the Variable in Fluid;
 - [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
@@ -21,9 +21,9 @@ There are some other basic concepts here
 
 The `inference/analysis` module make all the passes in a pipeline, and works in such way:
 
-1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
-2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
-3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+1. Build a `Graph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one, the same `Graph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `Graph`.
 
 The new optimization features can be added as an independent `Pass` and controlled by gflags,
 each pass will generate unified debug information or visualization for better debugging.
@@ -54,5 +54,5 @@ It can be used as a helper class that draws the modified graph after each pass.
 There is some helper legacy/function/class for analysis.
 
 - [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
-- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
-there are some implementations in  [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS..
+- [graph_traits.h](../../framework/ir/graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
+there are some implementations in  [graph_helper.cc](../../framework/ir/graph_helper.cc) , such as BFS and DFS..
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 30e8386f4c86e308372b5dd6328c7d3785a073b1..fb0ad31a3e612201de32813a65970c73b73b611b 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -54,8 +54,7 @@ if(WITH_TESTING)
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
     set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
-  elseif(NOT WIN32)
-    # TODO: Fix this unittest failed on Windows
+  elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
@@ -67,8 +66,7 @@ endif()
 if (NOT APPLE AND NOT WIN32)
   cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
-elseif (NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
+elseif (WIN32)
   cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
 endif()
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 61886c225e6548413e6e2eb0415f596d016a988f..a1c1e6de5fd44617a30f235a0416d897bf932075 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -218,6 +218,17 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
   Update();
 }
 
+void AnalysisConfig::EnableMkldnnBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  use_mkldnn_bfloat16_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
+  use_mkldnn_bfloat16_ = false;
+#endif
+
+  Update();
+}
+
 MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
   PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
                           "MkldnnQuantizer was not enabled yet.");
@@ -331,6 +342,12 @@ void AnalysisConfig::Update() {
 #endif
   }
 
+  if (use_mkldnn_bfloat16_) {
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnBfloat16();
+#endif
+  }
+
 #ifdef PADDLE_WITH_MKLDNN
   // Do not optimize when mkldnn is on
   if (enable_memory_optim_ && !use_mkldnn_) {
@@ -399,6 +416,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << ";";
 
   ss << use_mkldnn_quantizer_;
+  ss << use_mkldnn_bfloat16_;
   ss << model_from_memory_;
 
   ss << with_profile_;
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index dea448f9b03468eabda16d4375ea60348a09efb2..5766919f08e68832886b88b867bc48afa288a955 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
 }
 #endif
 
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim(true);
+  config.EnableUseGpu(100, 0);
+  config.EnableMkldnnBfloat16();
+#ifdef PADDLE_WITH_MKLDNN
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
+#else
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
+#endif
+}
+#endif
+
+TEST(AnalysisPredictor, bf16_pass_strategy) {
+  std::vector<std::string> passes;
+  PassStrategy passStrategy(passes);
+  passStrategy.EnableMkldnnBfloat16();
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 6a31ff281c68e3675d35c14059a453455ef398df..b1244e4e3dfdd5e6a627054250e6def2a7c35a89 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief Turn on MKLDNN bfloat16.
+  ///
+  ///
+  void EnableMkldnnBfloat16();
+
+  ///
+  /// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
+  ///
+  /// \return bool Whether to use the MKLDNN Bfloat16.
+  ///
+  bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
+
   ///
   /// \brief A boolean state telling whether the thread local CUDA stream is
   /// enabled.
@@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig {
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
   std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
+  bool use_mkldnn_bfloat16_{false};
 
   // If the config is already used on a predictor, it becomes invalid.
   // Any config can only be used with one predictor.
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index c07ac11e278901e9b9475492ca38411dcf8184d3..ffb70700b5f98a51b579a68f746ea1ee6a6d9f7b 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
   LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }
 
+void GpuPassStrategy::EnableMkldnnBfloat16() {
+  LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
+}
+
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be located in the front, so that they will
   // not be damaged by smaller ones.
@@ -223,4 +227,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
 #endif
 }
 
+void CpuPassStrategy::EnableMkldnnBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  use_mkldnn_bfloat16_ = true;
+#else
+  use_mkldnn_bfloat16_ = false;
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index c5a4a5f754d031a8e8f88a96dd16c89fbe1b0fbb..9073253520466a3711089bc7b7da04a9191e0a42 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \brief Enable MKLDNN quantize optimization.
   virtual void EnableMkldnnQuantizer() {}
 
+  /// \brief Enable MKLDNN bfloat16.
+  virtual void EnableMkldnnBfloat16() {}
+
   /// \brief Check if we are using gpu.
   /// \return A bool variable implying whether we are in gpu mode.
   bool use_gpu() const { return use_gpu_; }
@@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
     use_gpu_ = other.use_gpu_;
     use_mkldnn_ = other.use_mkldnn_;
     use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+    use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
   }
   /// \brief Default destructor.
   virtual ~CpuPassStrategy() = default;
@@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
   /// \brief Enable MKLDNN quantize optimization.
   void EnableMkldnnQuantizer() override;
 
+  /// \brief Enable MKLDNN bfloat16.
+  void EnableMkldnnBfloat16() override;
+
  protected:
   /// \cond Protected
   bool use_mkldnn_quantizer_{false};
+  bool use_mkldnn_bfloat16_{false};
   /// \endcond
 };
 
@@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
   /// \brief Not supported in GPU mode yet.
   void EnableMkldnnQuantizer() override;
 
+  /// \brief Not supported in GPU mode yet.
+  void EnableMkldnnBfloat16() override;
+
   /// \brief Default destructor.
   virtual ~GpuPassStrategy() = default;
 
diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h
index 4be6b48fb1820dc3271de164e87387c73ee67da9..32129890d02a2a0e0b357a6e0402d07b56bc6509 100644
--- a/paddle/fluid/inference/capi/paddle_c_api.h
+++ b/paddle/fluid/inference/capi/paddle_c_api.h
@@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
 PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
     const PD_AnalysisConfig* config);
 
+PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
+    PD_AnalysisConfig* config);
+
+PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
+    const PD_AnalysisConfig* config);
+
 PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
                                                  const char* prog_buffer,
                                                  size_t prog_buffer_size,
diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc
index f5445dd5a3f9b6499045361a36fd6363a79ef560..b99abc06b27ecb9686b4c6e883aaaf8b3e592415 100644
--- a/paddle/fluid/inference/capi/pd_config.cc
+++ b/paddle/fluid/inference/capi/pd_config.cc
@@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
   return config->config.mkldnn_quantizer_enabled();
 }
 
+void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
+                                      "PD_AnalysisConfig should not be null"));
+  config->config.EnableMkldnnBfloat16();
+}
+
+bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
+                                      "PD_AnalysisConfig should not be null"));
+  return config->config.mkldnn_bfloat16_enabled();
+}
+
 void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
                        size_t prog_buffer_size, const char* params_buffer,
                        size_t params_buffer_size) {
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 97d09925b19c4911a6b412518dc58fe88da16f64..10c212c0b4fa394e3c745bf524ef9d081c4bc3c1 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
 
   if (enable_int8) {
 #if IS_TRT_VERSION_GE(5000)
-    CHECK(op_desc.HasAttr("Input_scale"));
+    if (op_desc.Type() != "conv2d_transpose") {
+      PADDLE_ENFORCE_EQ(
+          op_desc.HasAttr("Input_scale"), true,
+          platform::errors::InvalidArgument("Input scale not found. TRT int8"
+                                            " requires conv/deconv to have "
+                                            "input quantization scales."));
+    }
     float in_scale =
         BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
     auto weight_scale =
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 1af2a4c5c73bb38bda7bc8b5b975c96175b6ea44..22d28a44cb8bbb89c85bb94eb8825bcad05bdc26 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
     } else if (shape.size() == 3UL) {
       return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
     }
-    return nvinfer1::Dims4(shape[0], shape[1], 1, 1);
+    nvinfer1::Dims dims;
+    dims.nbDims = shape.size();
+    for (size_t i = 0; i < shape.size(); i++) {
+      dims.d[i] = shape[i];
+    }
+    return dims;
   }
 }
 }  // NOLINT
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index e321c88af4170acac01d55a5709c65e59561999d..b538d248815727d75586993a1c1fa394f579038e 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller {
 #if IS_TRT_VERSION_GE(5130)
     teller_set.insert("relu6");
     teller_set.insert("hard_sigmoid");
+    int8_teller_set.insert("relu6");
+    int8_teller_set.insert("hard_sigmoid");
 #endif
 #if IS_TRT_VERSION_GE(6000)
     teller_set.insert("fused_embedding_eltwise_layernorm");
@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller {
                                                   "elementwise_add",
                                                   "leaky_relu",
                                                   "fc",
-                                                  "relu6",
                                                   "concat",
                                                   "scale",
                                                   "elementwise_mul",
-                                                  "conv2d_transpose"};
+                                                  "conv2d_transpose",
+                                                  "hard_swish"};
   std::unordered_set<std::string> teller_set{
       "matmul",
       "conv2d",
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index e7f9381e97137d77d27b54cac910bfee9f629464..5e43be90de3dbbfef3c7d3def7e37904bb644380 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
   return ret;
 }
 
+template <typename T>
+void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
+  for (auto ptr : embs_gpu_) {
+    if (ptr) cudaFree(ptr);
+  }
+
+  if (bias_gpu_) cudaFree(bias_gpu_);
+  if (scale_gpu_) cudaFree(scale_gpu_);
+}
+
 template <typename T>
 bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
   int64_t *emb_ptr_gpu_d =
       emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
 
-  std::vector<int64_t> in_ptr, emb_ptr;
+  std::vector<uintptr_t> in_ptr, emb_ptr;
   for (int i = 0; i < input_num; i++) {
     in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
     emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index 8ac611cd7c62fddfd4f01d7705b841abc28501d3..5babd87db0602e973452efa613fcaf9810d29afa 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new EmbEltwiseLayernormPluginDynamic(
+    auto ptr = new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
         eps_);
+    ptr->embs_gpu_ = embs_gpu_;
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const override {
@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     return sum_num;
   }
 
+  void terminate() override;
   void serialize(void* buffer) const override {
     // SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, emb_sizes_);
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index f1e11b6fba1f1556e2a8a2aaaca1223aaef76b03..860f1039d5e10290d84d1761bc7337e49fa210eb 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
 
 #if IS_TRT_VERSION_GE(6000)
 
+void PReluPluginDynamic::terminate() {
+  if (p_gpu_weight_) {
+    cudaFree(p_gpu_weight_);
+  }
+}
+
 int PReluPluginDynamic::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index 4756ca2e0225795edc3bd3112b21e3b628ad5c0b..3126366c5fdd8bb69a78cea11f5778c45de738ec 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
   }
   ~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
+    auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
+    ptr->p_gpu_weight_ = p_gpu_weight_;
+    return ptr;
   }
 
   const char* getPluginType() const override { return "prelu_plugin"; }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
+  void terminate() override;
 
   size_t getSerializationSize() const override;
   void serialize(void* buffer) const override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index 8fe1edc4bf0321b054322a27f0c16819bc023ed8..24cd8e0368182ae597e48765bc0167ca1eca6bd3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new SkipLayerNormPluginDynamic(
+    auto ptr = new SkipLayerNormPluginDynamic(
         bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = bias_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const override { return "skip_layernorm_plugin"; }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 6bc7728487cc9b00e2a20280c5579483b806bc10..814deda6729278e2e9f9e76ff83bbdd4966821c1 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -20,6 +20,12 @@ function(download_int8_data install_dir data_file)
     endif()
 endfunction()
 
+function(download_GRU_data install_dir data_file)
+    if (NOT EXISTS ${install_dir}/${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file})
+    endif()
+endfunction()
+
 function(download_quant_data install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
@@ -97,6 +103,18 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode
              --iterations=2)
 endfunction()
 
+function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary infer_model data_path)
+    inference_analysis_test_run(${TARGET_NAME}
+    COMMAND ${test_binary}
+        ARGS --infer_model=${infer_model}
+             --infer_data=${data_path}
+             --batch_size=50
+             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+             --with_accuracy_layer=true
+             --use_analysis=true
+             --iterations=2)
+endfunction()
+
 function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
 	py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
 	        ARGS --data_dir=${data_dir}
@@ -114,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML)
     set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
     download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
     inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+    set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
     # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
@@ -174,6 +193,8 @@ inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
     EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
 
+set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
+
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
@@ -315,6 +336,20 @@ if(WITH_MKLDNN)
   download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
   inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
 
+  ### Lexcial analysis GRU model
+  set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
+  download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
+  download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
+  set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
+  set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
+  set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
+  set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
+
+  # build test binary to be used in subsequent tests
+  inference_analysis_api_test_build(${LEXICAL_TEST_APP} ${LEXICAL_TEST_APP_SRC})
+  # run lexcial analysis test
+  inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
+
   ### optimized FP32 vs. Quant INT8 tests
   
   set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
@@ -439,19 +474,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
         inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
     endif()
 
-    inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+    inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
             EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
             ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
 
-    set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
-        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
-
 endif()
 
 set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
index c60e0a25f28c01c453276a8ef04eb79b35b7dda2..da0c93d21b7852e06b6805230078540063c2b243 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
@@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) {
   PD_SwitchIrOptim(config, true);
   bool ir_optim = PD_IrOptim(config);
   CHECK(ir_optim) << "NO";
+  PD_EnableMkldnnBfloat16(config);
+  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
+  CHECK(!bfloat16_enable) << "NO";
   PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
                           false);
   bool trt_enable = PD_TensorrtEngineEnabled(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index 93fcb43447d01dcafa10d8c85234d243d5095d4e..e24706691ed834ac4f49d924162035ec565d24ea 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
   PD_EnableMkldnnQuantizer(config);
   bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
   CHECK(quantizer_enable) << "NO";
+  PD_EnableMkldnnBfloat16(config);
+  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
+  CHECK(bfloat16_enable) << "NO";
   PD_SetMkldnnCacheCapacity(config, 0);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   PD_DeleteAnalysisConfig(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4035c803413794f8b3da8026d373aa8847054f3
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -0,0 +1,225 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+// setting iterations to 0 means processing the whole dataset
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void SetNativeConfig(AnalysisConfig *cfg,
+                     const int &num_threads = FLAGS_cpu_num_threads) {
+  cfg->SwitchIrOptim(false);
+  cfg->DisableGpu();
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->SetCpuMathLibraryNumThreads(num_threads);
+}
+
+void SetAnalysisConfig(AnalysisConfig *cfg,
+                       const int &num_threads = FLAGS_cpu_num_threads) {
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchIrOptim(true);
+  cfg->SwitchSpecifyInputNames(false);
+  cfg->SetCpuMathLibraryNumThreads(num_threads);
+  cfg->EnableMKLDNN();
+}
+
+std::vector<size_t> ReadSentenceLod(std::ifstream &file, size_t offset,
+                                    int64_t total_sentences_num) {
+  std::vector<size_t> sentence_lod(total_sentences_num);
+
+  file.clear();
+  file.seekg(offset);
+  file.read(reinterpret_cast<char *>(sentence_lod.data()),
+            total_sentences_num * sizeof(size_t));
+
+  if (file.eof()) LOG(ERROR) << "Reached end of stream";
+  if (file.fail()) throw std::runtime_error("Failed reading file.");
+  return sentence_lod;
+}
+
+template <typename T>
+class TensorReader {
+ public:
+  TensorReader(std::ifstream &file, size_t beginning_offset, std::string name)
+      : file_(file), position_(beginning_offset), name_(name) {}
+
+  PaddleTensor NextBatch(std::vector<int> shape, std::vector<size_t> lod) {
+    int numel =
+        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
+    PaddleTensor tensor;
+    tensor.name = name_;
+    tensor.shape = shape;
+    tensor.dtype = GetPaddleDType<T>();
+    tensor.data.Resize(numel * sizeof(T));
+    if (lod.empty() == false) {
+      tensor.lod.clear();
+      tensor.lod.push_back(lod);
+    }
+    file_.seekg(position_);
+    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
+    if (file_.fail())
+      throw std::runtime_error(name_ + ": failed reading file.");
+    file_.read(reinterpret_cast<char *>(tensor.data.data()), numel * sizeof(T));
+    position_ = file_.tellg();
+    return tensor;
+  }
+
+ protected:
+  std::ifstream &file_;
+  size_t position_;
+  std::string name_;
+};
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
+              int32_t batch_size = FLAGS_batch_size) {
+  std::ifstream file(FLAGS_infer_data, std::ios::binary);
+  if (!file) {
+    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
+  }
+
+  int64_t total_sentences_num = 0L;
+  int64_t total_words_num = 0L;
+  file.seekg(0);
+  file.read(reinterpret_cast<char *>(&total_sentences_num), sizeof(int64_t));
+  LOG(INFO) << "Total sentences in file: " << total_sentences_num;
+  file.read(reinterpret_cast<char *>(&total_words_num), sizeof(int64_t));
+  LOG(INFO) << "Total words in file: " << total_words_num;
+  size_t lods_beginning_offset = static_cast<size_t>(file.tellg());
+  auto words_begining_offset =
+      lods_beginning_offset + sizeof(size_t) * total_sentences_num;
+  auto targets_beginning_offset =
+      words_begining_offset + sizeof(int64_t) * total_words_num;
+
+  std::vector<size_t> lod_full =
+      ReadSentenceLod(file, lods_beginning_offset, total_sentences_num);
+
+  size_t lods_sum = std::accumulate(lod_full.begin(), lod_full.end(), 0UL);
+  EXPECT_EQ(lods_sum, static_cast<size_t>(total_words_num));
+
+  TensorReader<int64_t> words_reader(file, words_begining_offset, "words");
+  TensorReader<int64_t> targets_reader(file, targets_beginning_offset,
+                                       "targets");
+  // If FLAGS_iterations is set to 0, run all batches
+  auto iterations_max = total_sentences_num / batch_size;
+  auto iterations = iterations_max;
+  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
+    iterations = FLAGS_iterations;
+  }
+
+  for (auto i = 0; i < iterations; i++) {
+    // Calculate the words num.  Shape=[words_num, 1]
+    std::vector<size_t> batch_lod = {0};
+    size_t num_words = 0L;
+    std::transform(lod_full.begin() + i * FLAGS_batch_size,
+                   lod_full.begin() + (i + 1) * FLAGS_batch_size,
+                   std::back_inserter(batch_lod),
+                   [&num_words](const size_t lodtemp) -> size_t {
+                     num_words += lodtemp;
+                     return num_words;
+                   });
+    auto words_tensor = words_reader.NextBatch(
+        {static_cast<int>(batch_lod[FLAGS_batch_size]), 1}, batch_lod);
+    if (FLAGS_with_accuracy_layer) {
+      auto targets_tensor = targets_reader.NextBatch(
+          {static_cast<int>(batch_lod[FLAGS_batch_size]), 1}, batch_lod);
+      inputs->emplace_back(std::vector<PaddleTensor>{
+          std::move(words_tensor), std::move(targets_tensor)});
+    } else {
+      inputs->emplace_back(std::vector<PaddleTensor>{std::move(words_tensor)});
+    }
+  }
+}
+
+std::vector<double> Lexical_Test(
+    const std::vector<std::vector<PaddleTensor>> &input_slots_all,
+    std::vector<std::vector<PaddleTensor>> *outputs, AnalysisConfig *config,
+    const bool use_analysis) {
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(config),
+      input_slots_all, outputs, FLAGS_use_analysis);
+  std::vector<double> acc_res(3);
+  if (FLAGS_with_accuracy_layer) {
+    EXPECT_GT(outputs->size(), 0UL);
+    EXPECT_EQ(3UL, (*outputs)[0].size());
+    std::vector<int64_t> acc_sum(3);
+    for (size_t i = 0; i < outputs->size(); i++) {
+      for (size_t j = 0; j < 3UL; j++) {
+        acc_sum[j] =
+            acc_sum[j] + *static_cast<int64_t *>((*outputs)[i][j].data.data());
+      }
+    }
+    // nums_infer, nums_label, nums_correct
+    auto precision =
+        acc_sum[0]
+            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[0])
+            : 0;
+    auto recall =
+        acc_sum[1]
+            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[1])
+            : 0;
+    auto f1_score =
+        acc_sum[2]
+            ? static_cast<float>(2 * precision * recall) / (precision + recall)
+            : 0;
+
+    LOG(INFO) << "Precision:  " << std::fixed << std::setw(6)
+              << std::setprecision(5) << precision;
+    LOG(INFO) << "Recall:  " << std::fixed << std::setw(6)
+              << std::setprecision(5) << recall;
+    LOG(INFO) << "F1 score: " << std::fixed << std::setw(6)
+              << std::setprecision(5) << f1_score;
+
+    acc_res = {precision, recall, f1_score};
+    // return acc_res;
+  } else {
+    EXPECT_GT(outputs->size(), 0UL);
+    EXPECT_EQ(outputs[0].size(), 1UL);
+    LOG(INFO) << "No accuracy result. To get accuracy result provide a model "
+                 "with accuracy layers in it and use --with_accuracy_layer "
+                 "option.";
+  }
+  return acc_res;
+}
+
+TEST(Analyzer_lexical_test, Analyzer_lexical_analysis) {
+  AnalysisConfig native_cfg;
+
+  std::vector<std::vector<PaddleTensor>> outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  SetNativeConfig(&native_cfg, FLAGS_cpu_num_threads);
+  std::vector<double> acc_ref(3);
+  acc_ref = Lexical_Test(input_slots_all, &outputs, &native_cfg, false);
+  if (FLAGS_use_analysis) {
+    AnalysisConfig analysis_cfg;
+    SetAnalysisConfig(&analysis_cfg, FLAGS_cpu_num_threads);
+    analysis_cfg.pass_builder()->AppendPass("mkldnn_placement_pass");
+    std::vector<double> acc_analysis(3);
+    acc_analysis = Lexical_Test(input_slots_all, &outputs, &analysis_cfg, true);
+    for (size_t i = 0; i < acc_analysis.size(); i++) {
+      CHECK_LE(std::abs(acc_ref[i] - acc_analysis[i]),
+               FLAGS_quantized_accuracy);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
index 1fc35f86bcbc5b595f43569f14e1d2e2bc63c3ad..51870e804144a002b15750ee8c8fa9f4c0af40dc 100644
--- a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
+++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
@@ -18,7 +18,7 @@ For reference, please examine the code of unit test enclosed in [analyzer_int8_i
 
 * ### Create Analysis config
 
-INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease)
+INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/inference_deployment/inference/native_infer_en.html#a-name-use-analysisconfig-to-manage-inference-configurations-use-analysisconfig-to-manage-inference-configurations-a)
 
 * ### Create quantize config by analysis config
 
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 0aea47ae7fab1be3bafe35af575e9a2bea2d8420..5840a4c42b3b1065410dc1509cf0cee2480bd596 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) {
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) {
     EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
-                10e-5);
+                12e-5);
   }
 }
 
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
index 394c191c4cfd568a0ce21f59befc0bd3b92d00f1..524e08891f4e90d8a322822e26d75689526d30f5 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -122,8 +122,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
   config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 opt_input_shape);
+  AnalysisConfig* config_deser = new AnalysisConfig(config);
+
   std::vector<float> out_data;
-  run(config, &out_data);
+  run(config, &out_data);         // serialize
+  run(*config_deser, &out_data);  // deserialize
   for (size_t i = 0; i < out_data.size(); i++) {
     EXPECT_NEAR(result[i], out_data[i], 1e-6);
   }
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
index 8c4ada280cce2b47f3a6b3220cec42a8458715d0..17fedc3d3b8bb8451fac76f6c7dec4ac057fd1d2 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -125,7 +125,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   std::vector<float> out_data;
   run(config, &out_data);
   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+    EXPECT_NEAR(result[i], out_data[i], 1e-5);
   }
 }
 
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 72816f6d0317600ad6bf8ffc4ad31bd1a23d7c30..b9f979f96d4b106642795151fb8e34b025b2caef 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
       ${EXTERNAL_PROJECT_NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${INSTALL_DIR}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
-                            ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
+      URL                   ${URL}/${FILENAME}
       DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_EXTRACT   1
       DOWNLOAD_NO_PROGRESS  1
       CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
+      BUILD_COMMAND         ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
+                            ${CMAKE_COMMAND} -E tar xzf ${FILENAME}
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
-if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
+if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
   inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index bd1908ac65509343530aa57489661637eed72595..9cc7c267454a4dbd4e1f62ec971e4160d6088913 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
     set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
+elseif(WITH_XPU)
+    set(AllocatorFacadeDeps xpu_info)
 else ()
     set(AllocatorFacadeDeps)
 endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 2ab0d69ef806155adbff83e523a1242e51c2c7fc..3213684c140b02e1fa4b846cb0448f9bc9d8f3ee 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -39,6 +39,9 @@
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
 
 DEFINE_int64(
     gpu_allocator_retry_time, 10000,
@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
     switch (strategy) {
       case AllocatorStrategy::kNaiveBestFit: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
 
       case AllocatorStrategy::kAutoGrowth: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
 
       case AllocatorStrategy::kThreadLocal: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
  private:
   void InitSystemAllocators() {
     system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
+#ifdef PADDLE_WITH_XPU
+    int device_count = platform::GetXPUDeviceCount();
+    for (int i = 0; i < device_count; ++i) {
+      platform::XPUPlace p(i);
+      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+    }
+#endif
 #ifdef PADDLE_WITH_CUDA
     system_allocators_[platform::CUDAPinnedPlace()] =
         std::make_shared<CPUPinnedAllocator>();
@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
+    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+  }
+#endif
+
   class ZeroSizeAllocator : public Allocator {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
     }
     places.emplace_back(platform::CUDAPinnedPlace());
 #endif
+#ifdef PADDLE_WITH_XPU
+    int device_count = platform::GetXPUDeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      places.emplace_back(platform::XPUPlace(dev_id));
+    }
+#endif
 
     for (auto& p : places) {
       zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 907a266e7b2bcd30e65ca71ab3dbae7f9b110b3b..92e3933a072832fa42520e67f455d3dc90118518 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -29,6 +29,9 @@
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
 
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
+#ifdef PADDLE_WITH_XPU
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void *p = nullptr;
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (FLAGS_init_allocated_mem) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "xpu memory FLAGS_init_allocated_mem is not implemented."));
+  }
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  VLOG(10) << "  pointer=" << p;
+  return p;
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+  return nullptr;
+#endif
+}
+
+template <>
+void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
+                              size_t size) {
+#ifdef PADDLE_WITH_XPU
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  xpu_free(p);
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+#endif
+}
+
+template <>
+size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
+#ifdef PADDLE_WITH_XPU
+  printf("Used func return 0 for XPUPlace\n");
+  return 0;
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+#endif
+}
+
 #ifdef PADDLE_WITH_CUDA
 class GPUBuddyAllocatorList {
  private:
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index b19f02db1c0ddf17c84536bf5d512bbd823909b2..225b6858cc1f2a5afc9d612958694d0d940e2e7b 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
+
 namespace paddle {
 namespace memory {
 
@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
+#ifdef PADDLE_WITH_XPU
+template <>
+void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != dst_place.device) {
+    ret = xpu_set_device(dst_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+  ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id != dst_place.device) {
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::XPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != src_place.device) {
+    ret = xpu_set_device(src_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+  ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id != src_place.device) {
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+
+template <>
+void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::XPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != src_place.device || dev_id != dst_place.device) {
+    ret = xpu_set_device(src_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    void* tmp = malloc(num);
+    ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_set_device(dst_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    free(tmp);
+  } else {
+    int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 012b16a6a05f3d5fec3636b0a790d4d67334295f..6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,12 +86,16 @@ if (WITH_DGC)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)
 endif()
 
+cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
+lod_tensor maxouting unpooling pooling lod_rank_table context_project
+sequence_pooling executor device_memory_aligment generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
@@ -111,6 +115,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
 set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 
+cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax)
 cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
@@ -118,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
+nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
 if (WITH_GPU)
     nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
 else()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 1ecb9dd26da3b2dc0bd19aceb03f4e14e5481dde..63b3b0f1a3408154a2d1c8aff76a85a95ad044f6 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+
 #include <memory>
 #include <string>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
+
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 #ifdef PADDLE_WITH_CUDA
@@ -216,7 +219,7 @@ $$out = \\frac{1}{\\sqrt{x}}$$
 )DOC";
 
 UNUSED constexpr char AbsDoc[] = R"DOC(
-Abs Activation Operator.
+Abs Operator.
 
 $$out = |x|$$
 
@@ -239,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$
 UNUSED constexpr char CosDoc[] = R"DOC(
 Cosine Operator. Computes cosine of x element-wise.
 
+Input range is `(-inf, inf)` and output range is `[-1,1]`.
+Return `nan` if input is out of boundary.
+
 $$out = cos(x)$$
 
 )DOC";
@@ -311,13 +317,6 @@ $$out = x^2$$
 
 )DOC";
 
-UNUSED constexpr char SoftplusDoc[] = R"DOC(
-Softplus Activation Operator.
-
-$$out = \ln(1 + e^{x})$$
-
-)DOC";
-
 UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.
 
@@ -331,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of acos operator");
     AddOutput("Out", "Output of acos operator");
     AddComment(R"DOC(
-Arccosine Activation Operator.
+Arccosine Operator.
 
 $$out = \cos^{-1}(x)$$
 
@@ -345,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of asin operator");
     AddOutput("Out", "Output of asin operator");
     AddComment(R"DOC(
-Arcsine Activation Operator.
+Arcsine Operator.
 
 $$out = \sin^{-1}(x)$$
 
@@ -359,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of atan operator");
     AddOutput("Out", "Output of atan operator");
     AddComment(R"DOC(
-Arctanh Activation Operator.
+Arctangent Operator.
 
-$$out = \tanh^{-1}(x)$$
+$$out = \tan^{-1}(x)$$
 
 )DOC");
   }
@@ -390,6 +389,36 @@ $$out = \max(x, \alpha * x)$$
   }
 };
 
+class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "Input of Softplus operator, an N-D Tensor, with data type "
+             "float32, float64 or float16.");
+    AddOutput(
+        "Out",
+        "Output of Softplus operator, a Tensor with shape same as input.");
+    AddAttr<float>("beta", "The value of beta for Softplus.").SetDefault(1.0f);
+    AddAttr<float>("threshold", "The value of threshold for Softplus.")
+        .SetDefault(20.0f);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+:strong:`Softplus Activation Operator`
+
+..  math::
+    out = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) \\
+    \text{For numerical stability, the implementation reverts to the linear function when :}\,x \times \beta > threshold.
+
+)DOC");
+  }
+};
+
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -504,6 +533,9 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("threshold",
                    "The threshold value of Relu6. Default is 6.0. ")
         .SetDefault(6.0f);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Relu6 Activation Operator.
 
@@ -663,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
 REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
 REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc);
 REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
-REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
 REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
 
 template <ActBwdOpFwdDeps kDepValue>
@@ -750,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
   }
 };
 
-// leaky_relu Grad: dx=dy if y>=0 else alpha * dy
-// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx
+// leaky_relu Grad: dx=dy if x>=0 else alpha * dy
+// leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx
 template <typename T>
 class LeakyReluDoubleGradMaker
     : public ::paddle::framework::SingleGradOpMaker<T> {
@@ -761,8 +792,8 @@ class LeakyReluDoubleGradMaker
  protected:
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("leaky_relu_grad_grad");
-    // input1: Out
-    op->SetInput("Out", this->Input("Out"));
+    // input1: X
+    op->SetInput("X", this->Input("X"));
     // X@GRAD@GRAD: ddx
     op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(this->Attrs());
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 3aac7ae8a5e8a9e889242b59f42a29af08ad1c46..00a7c063c9155488d117332d5ef3541d16d76bdb 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   }
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    out.device(d) = x * (temp1 + temp2);
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
   }
 };
 
@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
-// softplus(x) = log(1 + exp(x))
-// When x is a very large positive number, exp(x) may explode to inf,
-// Using trick below for numerical stability
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
+// For numerical stability, using the following formula instead of softplus(x) =
+// log(1 + exp(x))
+// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
+// 1, threshold = 20 by default), otherwise x
 template <typename T>
 struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
+    auto x_beta = static_cast<T>(beta) * x;
+    out.device(d) = (x_beta > static_cast<T>(threshold))
+                        .select(x, (static_cast<T>(1) + x_beta.exp()).log() /
+                                       static_cast<T>(beta));
   }
 };
 
-// d(softplus(x))/dx = exp(x) / (1 + exp(x))
-// For numerical stability:
-// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
-// exp(x - max(x, 0)))
+// For numerical stability, using the following formula instead of
+// d(softplus(x))/dx = 1 / (1 + exp(-x))
+// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
+// = 1, threshold = 20 by default), otherwise x
 template <typename T>
 struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    auto x_beta = static_cast<T>(beta) * x;
     dx.device(d) =
-        dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+        (x_beta > static_cast<T>(threshold))
+            .select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
 
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    if (alpha < 1.f) {
+      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    } else {
+      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
+    }
   }
 };
 
@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 =
-        static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>();
-    auto temp2 = (out > static_cast<T>(0)).template cast<T>();
+        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
-                   dout * static_cast<T>(alpha) * x.exp() *
-                       (x <= static_cast<T>(0)).template cast<T>();
+    auto temp_a_pos = static_cast<T>(alpha > 0);
+    auto temp_a_neg = static_cast<T>(alpha <= 0);
+    auto temp_x_pos = (x > static_cast<T>(0)).template cast<T>();
+    auto temp_x_neg = (x <= static_cast<T>(0)).template cast<T>();
+
+    // dx = dout, if alpha > 0 and x > 0
+    // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0
+    // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0
+    // dx = 0, if alpha <= 0 and x <=0
+    dx.device(d) =
+        dout * temp_a_pos * temp_x_pos +
+        dout * static_cast<T>(alpha) * x.exp() * temp_a_pos * temp_x_neg +
+        dout * (static_cast<T>(1) + static_cast<T>(alpha) * x.exp()) *
+            temp_a_neg * temp_x_pos;
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
       auto* d = dev.eigen_device();
       auto ddx = framework::EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto out = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad"));
+      auto x = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
       auto ddout = framework::EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((out > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) *
-                              (out <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
+      ddout.device(*d) =
+          ddx *
+          ((x > static_cast<T>(0)).template cast<T>() +
+           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
+              .template cast<T>();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index f7cc513b234e6e440507af28189ac236b71f9d15..d1a3695015abdb9ce13c4f807d1abacdf0af024d 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -28,10 +28,15 @@ using Tensor = framework::Tensor;
 
 template <typename T>
 struct Linspace<paddle::platform::CPUDeviceContext, T> {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
                   const framework::ExecutionContext& ctx) {
     T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
     T slice = (end - start) / (T)(count - 1);
+    if (!align_corners) {
+      slice = (end - start) / (T)count;
+      start *= (T)(count - 1) / (T)count;
+    }
     for (int i = 0; i < count; ++i) {
       number_data[i] = start + (T)i * slice;
     }
@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
         "use_cudnn",
         "(bool, default false) Only used in cudnn kernel, need install cudnn")
         .SetDefault(true);
+    AddAttr<bool>("align_corners",
+                  "(bool, default false) Whether to align the corners of input"
+                  "and ouput.")
+        .SetDefault(true);
     AddAttr<std::vector<int>>(
         "output_shape",
         "The target output image shape with format [N, C, H, W].")
@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
               [-1.  -0.5  0.   0.5  1. ]
               [-1.  -0.5  0.   0.5  1. ]
               [-1.  -0.5  0.   0.5  1. ]]]
-        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+        C[0] is the coordinates in height axis and  C[1] is the coordinates in
+        width axis.
     
     Step2:
-        Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+        Tanspose and reshape C to shape [H * W, 2] and append ones to last
+        dimension. The we get:
         C_ = [[-1.  -1.   1. ]
               [-0.5 -1.   1. ]
               [ 0.  -1.   1. ]
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7aaaa0002c5ab31af72c75e69f5a283c09633ba4
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -0,0 +1,211 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/affine_grid_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+struct Linspace<paddle::platform::CUDADeviceContext, T> {
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
+                  const framework::ExecutionContext& ctx) {
+    T* number_data = numbers->mutable_data<T>({count}, ctx.GetPlace());
+    T slice = (end - start) / (T)(count - 1);
+    if (!align_corners) {
+      slice = (end - start) / (T)count;
+      start *= (T)(count - 1) / (T)count;
+    }
+    auto stream = ctx.cuda_device_context().stream();
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, slice, count,
+                                                  number_data);
+  }
+};
+
+template <typename T>
+__global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,
+                                   T h_start, T w_start, T h_step, T w_step,
+                                   const T* theta,  // N, 2, 3
+                                   T* output) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int w = index % out_w;
+    int h = (index / out_w) % out_h;
+    int n = index / (out_w * out_h);
+
+    T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
+    T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
+
+    int theta_offset = n * 6;  // 2 * 3;
+    // affine from (h_coor, w_coor) to (x, y)
+    output[index * 2] = theta[theta_offset] * h_coor +
+                        theta[theta_offset + 1] * w_coor +
+                        theta[theta_offset + 2];
+    output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
+                            theta[theta_offset + 4] * w_coor +
+                            theta[theta_offset + 5];
+  }
+}
+
+template <typename T>
+__global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
+                                        int out_w, T h_start, T w_start,
+                                        T h_step, T w_step,
+                                        const T* out_grad,  // N, H, W, 2
+                                        T* theta_grad) {    // N, 2, 3
+  CUDA_KERNEL_LOOP(index, count) {
+    int w = index % out_w;
+    int h = (index / out_w) % out_h;
+    int n = index / (out_w * out_h);
+    T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
+    T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
+
+    int theta_offset = n * 6;  // 2 * 3;
+    T out_grad_x = out_grad[index * 2];
+    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
+
+    T out_grad_y = out_grad[index * 2 + 1];
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
+  }
+}
+
+template <typename T>
+class AffineGridOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* theta = ctx.Input<Tensor>("Theta");
+    int n = theta->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+    auto* output = ctx.Output<Tensor>("Output");
+    T* out_data = output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    T h_step;
+    T w_step;
+    T h_start = -1;
+    T w_start = -1;
+    if (align_corners) {
+      h_step = static_cast<T>(2) / static_cast<T>(h - 1);
+      w_step = static_cast<T>(2) / static_cast<T>(w - 1);
+    } else {
+      h_step = static_cast<T>(2) / static_cast<T>(h);
+      w_step = static_cast<T>(2) / static_cast<T>(w);
+
+      h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
+      w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
+    }
+
+    const int count = n * h * w;
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    auto cu_stream = ctx.cuda_device_context().stream();
+    affine_grid_kernel<<<grid, block, 0, cu_stream>>>(
+        count, n, h, w, h_start, w_start, h_step, w_step,
+        theta->data<T>(),  // N, 2, 3
+        out_data);
+  }
+};
+
+template <typename T>
+class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+    T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
+
+    T h_step;
+    T w_step;
+    T h_start = -1;
+    T w_start = -1;
+    if (align_corners) {
+      h_step = static_cast<T>(2) / static_cast<T>(h - 1);
+      w_step = static_cast<T>(2) / static_cast<T>(w - 1);
+    } else {
+      h_step = static_cast<T>(2) / static_cast<T>(h);
+      w_step = static_cast<T>(2) / static_cast<T>(w);
+
+      h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
+      w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
+    }
+    const int count = n * h * w;
+    VLOG(3) << "count: " << count << "; h_step: " << h_step
+            << "; w_step: " << w_step << "; h_start: " << h_start
+            << "; w_start: " << w_start;
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    auto cu_stream = ctx.cuda_device_context().stream();
+    affine_grid_grad_kernel<<<grid, block, 0, cu_stream>>>(
+        count, n, h, w, h_start, w_start, h_step, w_step,
+        output_grad->data<T>(), theta_grad_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(affine_grid, ops::AffineGridOpCUDAKernel<float>,
+                        ops::AffineGridOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(affine_grid_grad,
+                        ops::AffineGridGradOpCUDAKernel<float>,
+                        ops::AffineGridGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 73df8a38b96c30196a7e39d2cf1e348f2a7722ec..50c9ebcd9c8f52077d7f5d0abb10c631cbeee794 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>;
  */
 template <typename DeviceContext, typename T>
 struct Linspace {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
                   const framework::ExecutionContext& ctx);
 };
 
 template <typename DeviceContext, typename T>
-inline void GetIdxMap(int n, int h, int w, Tensor* grid,
+inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid,
                       const framework::ExecutionContext& ctx) {
   auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
   grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
   // Get indexes of height with shape [height, width, 1]
   Tensor h_idx;
   Linspace<DeviceContext, T> linspace;
-  linspace((T)-1, (T)1, h, &h_idx, ctx);
+  linspace((T)-1, (T)1, h, align_corners, &h_idx, ctx);
   auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
   // Get indexes of width with shape [height, width, 1]
   Tensor w_idx;
-  linspace((T)-1, (T)1, w, &w_idx, ctx);
+  linspace((T)-1, (T)1, w, align_corners, &w_idx, ctx);
   auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
   // Get constant ones tensor with shape [height, width, 1]
   Tensor ones;
   ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+
+  math::SetConstant<DeviceContext, T>()(
+      ctx.template device_context<DeviceContext>(), &ones, static_cast<T>(1));
+  auto ones_t = EigenTensor<T, 3>::From(ones);
   // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
   // ones
   Tensor w_idx_map;
@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
   Tensor w_h_one_idx_map;
   w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
   auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
-
   w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
                                   .broadcast(Array2(h, 1))
                                   .reshape(Array3(h, w, 1));
-
   h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
                                   .broadcast(Array2(w, 1))
                                   .shuffle(Array2(1, 0))
@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     auto* theta = ctx.Input<Tensor>("Theta");
     int n = theta->dims()[0];
     auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
     int h = 0;
     int w = 0;
     if (size_attr.size() == 0) {
@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<DeviceContext>(), output,
         static_cast<T>(0));
     Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
+    GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
     int n = output_grad->dims()[0];
     auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
     int h = 0;
     int w = 0;
     if (size_attr.size() == 0) {
@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<DeviceContext>(), theta_grad,
         static_cast<T>(0));
     Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
+    GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc
index 911757007266c9ff88b0e348d350909ce0ff0bce..736483c3304ac32491de4fd98879fbfef04f7110 100644
--- a/paddle/fluid/operators/allclose_op.cc
+++ b/paddle/fluid/operators/allclose_op.cc
@@ -22,9 +22,11 @@ namespace operators {
 class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Input", "The first input tensor to compare.");
-    AddInput("Other", "The second input tensor to compare.");
-    AddOutput("Out", "The output tensor of allclose op.");
+    AddInput("Input",
+             "The input tensor, it's data type should be float32, float64.");
+    AddInput("Other",
+             "The input tensor, it's data type should be float32, float64.");
+    AddOutput("Out", "The output tensor, it's data type is bool.");
 
     AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
         .SetDefault(1e-5);
@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
 
     AddComment(R"DOC( 
-This operator checks if all :math:`input` and :math:`other` satisfy the condition:
+This operator checks if all :math:`x` and :math:`y` satisfy the condition:
 
-:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|`
+.. math::
+    \left| x - y \right| \leq atol + rtol \times \left| y \right|
 
-elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this
+elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this
 operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if
 two tensors are elementwise equal within a tolerance.
 )DOC");
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
index 01b6ccedcdd8156269d445d7822a4184c062b225..7f0ca1493f712f7f4809a56bf6a23f8757f94c2d 100644
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
+++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
+
 #include <string>
 #include <vector>
 
@@ -67,7 +68,7 @@ class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
               "amp_check_finite_and_unscale operator.")
         .AsDuplicable();
     AddOutput("FoundInfinite",
-              "(Tensor) 1-dim tensor, contains a int scalar, which indicates "
+              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
               "if there there is infinite or nan item in input X.");
     AddComment(R"DOC(
 amp_check_finite_and_scale operator.
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
index b92c6901d71bd80c45b0681f62a1a2ddedfcf64a..ee00d7c5f4499867c2c706ddcf314c1bfae0a866 100644
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
+++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cuda.h>
+
 #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -21,7 +22,7 @@ namespace operators {
 
 template <typename T>
 __global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
-                                       int* found_inf, T* out) {
+                                       bool* found_inf, T* out) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 
   if (idx < num) {
@@ -44,7 +45,7 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
     auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
 
     const T* scale_data = scale->data<T>();
-    int* found_inf_data = found_inf->mutable_data<int>(dev_ctx.GetPlace());
+    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
     cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
 
     for (size_t i = 0; i < xs.size(); ++i) {
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
index 85e4f98173511435a52b32e506afc8d5b772f74f..14708c4df10f5160d0e72e7669e0015554d8215f 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -1,29 +1,22 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..73581dac4e419ca9c970db4414ff54d4cbd3fd70
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef __NVCC__
+
+#include <cub/cub.cuh>
+#include <limits>
+#include <string>
+#include <typeinfo>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/transpose_op.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {  // NOLINT
+template <typename K, typename V>
+using KeyValuePair = cub::KeyValuePair<K, V>;
+using Tensor = framework::Tensor;
+
+}  // end namespace
+
+#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
+  case (1 << (log2_block_dim)): {                       \
+    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
+    __VA_ARGS__;                                        \
+  } break
+
+#define FIXED_BLOCK_DIM_CASE(...)               \
+  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
+
+template <typename T, typename IndType, class Reducer, size_t BlockDim>
+__global__ void ArgCUDAKernel(const int64_t height,     // n * h
+                              const int64_t width,      // c
+                              const int64_t post_size,  // h
+                              const Reducer reducer, const T init, const T* in,
+                              IndType* out) {
+  typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
+    KeyValuePair<int, T> kv_pair = {-1, init};
+    int h = idx / post_size;
+    int w = idx % post_size;
+    for (int k = threadIdx.x; k < width; k += blockDim.x) {
+      kv_pair =
+          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
+    }
+    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
+    if (threadIdx.x == 0) {
+      out[idx] = static_cast<IndType>(kv_pair.key);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, typename IndType, class Reducer>
+void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
+                    Tensor* indices, const int64_t pre, const int64_t post,
+                    const int64_t n) {
+  auto cu_stream = ctx.stream();
+  auto ComputeBlockSize = [](int64_t col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256)
+      return 512;
+    else if (col > 128)
+      return 256;
+    else if (col > 64)
+      return 128;
+    else if (col > 32)
+      return 64;
+    else if (col > 16)
+      return 32;
+    else if (col > 8)
+      return 16;
+    else
+      return 8;
+  };
+
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t height = pre * post;
+  int64_t width = n;
+  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
+
+  const T* in_data = input.data<T>();
+  IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
+
+  if (typeid(Reducer) == typeid(cub::ArgMax)) {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T, IndType, Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height, width, post, Reducer(), std::numeric_limits<T>::lowest(),
+              in_data, out_data));
+    }
+  } else {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T, IndType, Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height, width, post, Reducer(), std::numeric_limits<T>::max(),
+              in_data, out_data));
+    }
+  }
+}
+
+template <typename T, class Reducer>
+struct VisitDataCudaArgMinMaxFunctor {
+  const framework::ExecutionContext& ctx;
+
+  explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {}
+  template <typename IndType>
+  void apply() const {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    int axis = ctx.Attr<int64_t>("axis");
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    framework::DDim input_dims;
+    if (flatten) {
+      input_dims = framework::make_ddim({input->numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      input_dims = input->dims();
+      if (axis < 0) axis += input->dims().size();
+    }
+
+    int64_t numel = input->numel();
+    int64_t groups = numel / input_dims[axis];
+    int64_t pre = 1;
+    int64_t post = 1;
+    int64_t n = input_dims[axis];
+
+    for (int i = 0; i < axis; i++) {
+      pre *= input_dims[i];
+    }
+
+    for (int i = axis + 1; i < input_dims.size(); i++) {
+      post *= input_dims[i];
+    }
+
+    const auto& dev_ctx = ctx.cuda_device_context();
+    ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
+  }
+};
+template <typename T, class Reducer>
+class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dtype = ctx.Attr<int>("dtype");
+    if (dtype < 0) {
+      framework::VisitDataType(static_cast<framework::proto::VarType::Type>(
+                                   framework::proto::VarType::INT64),
+                               VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
+      return;
+    }
+    framework::VisitDataType(
+        static_cast<framework::proto::VarType::Type>(dtype),
+        VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
+  }
+};
+
+#endif
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index 0fc7b47c62ea9d7da805b797fcf5e4db4e39328d..ae3637f6f99783d70bd57a3935a979b0387692de 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {};
   struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
                           enum_argminmax_value> {                             \
     void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
-                    framework::LoDTensor* out, int64_t axis, bool keepdims) { \
-      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
+                    framework::LoDTensor* out, framework::DDim x_dims,        \
+                    int64_t axis, bool keepdims) {                            \
+      auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims);      \
       if (keepdims) {                                                         \
         auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out);      \
         out_eigen.device(*(ctx.eigen_device())) =                             \
@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor {
     out.template mutable_data<Tout>(ctx.GetPlace());
     auto axis = ctx.Attr<int64_t>("axis");
     auto keepdims = ctx.Attr<bool>("keepdims");
-    auto x_rank = x.dims().size();
-    if (axis < 0) axis += x_rank;
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    // if flatten, will construct the new dims for the cacluate
+    framework::DDim x_dims;
+    if (flatten) {
+      x_dims = framework::make_ddim({x.numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      x_dims = x.dims();
+      if (axis < 0) axis += x_dims.size();
+    }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
 #define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
   ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
       functor##rank;                                                 \
-  functor##rank(dev_ctx, x, &out, axis, keepdims)
+  functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)
 
-    switch (x.dims().size()) {
+    switch (x_dims.size()) {
       case 1:
         CALL_ARG_MINMAX_FUNCTOR(1);
         break;
@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
     const auto& x_dims = ctx->GetInputDim("X");
     int64_t axis = ctx->Attrs().Get<int64_t>("axis");
     bool keepdims = ctx->Attrs().Get<bool>("keepdims");
+    const bool& flatten = ctx->Attrs().Get<bool>("flatten");
 
     PADDLE_ENFORCE_GE(axis, -x_dims.size(),
                       platform::errors::InvalidArgument(
@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
         platform::errors::InvalidArgument(
             "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
 
-    auto x_rank = x_dims.size();
-    if (axis < 0) axis += x_rank;
     std::vector<int64_t> vec;
-    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
-    if (keepdims) {
-      vec.push_back(static_cast<int64_t>(1));
+    if (flatten) {
+      // if is flatten, will return the only on element
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+    } else {
+      auto x_rank = x_dims.size();
+      if (axis < 0) axis += x_rank;
+      for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+      for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
     }
-    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
     ctx->SetOutputDim("Out", framework::make_ddim(vec));
   }
 };
@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
     AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
     AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
+    AddAttr<bool>("flatten",
+                  "Flatten the input value, and search the min or max indices")
+        .SetDefault(false);
     AddComment(string::Sprintf(R"DOC(
       %s Operator.
 
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
index 47d7c8b12243c6c5c501188af7f48f125c266009..23170bf0087906d752767051ce58874cb3584ee5 100644
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -1,29 +1,21 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
+REGISTER_OP_CUDA_KERNEL(
+    arg_min, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMin>);
diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc
index 50797a100b1a67244b7c7b40b47404b60dc6af65..f56789b889526301e670ac37d1d6131aaafb070a 100644
--- a/paddle/fluid/operators/bce_loss_op.cc
+++ b/paddle/fluid/operators/bce_loss_op.cc
@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss");
 
     auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(
-        x_dims.size(), label_dims.size(),
-        platform::errors::InvalidArgument(
-            "Input(X) and Input(Label) shall have the same shape."));
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(label_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+    auto labels_dims = ctx->GetInputDim("Label");
+
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
+                      platform::errors::InvalidArgument(
+                          "Input(X) and Input(Label) shall have the same rank."
+                          "But received: the rank of Input(X) is [%d], "
+                          "the rank of Input(Label) is [%d].",
+                          rank, labels_dims.size()));
+
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
     if (check) {
-      PADDLE_ENFORCE_EQ(
-          x_dims.size(), label_dims.size(),
-          platform::errors::InvalidArgument(
-              "ShapeError: Input(X) and Input(Label) shall have the same shape "
-              "But received: the shape of Input(X) is [%s], the shape of "
-              "Input(Label) is [%s].",
-              x_dims, label_dims));
+      PADDLE_ENFORCE_EQ(x_dims, labels_dims,
+                        platform::errors::InvalidArgument(
+                            "Input(X) and Input(Label) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Label) is [%s].",
+                            x_dims, labels_dims));
     }
 
     ctx->ShareDim("X", "Out");
@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel {
                    framework::GradVarName("X"), "BCELossGrad");
 
     auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Label");
     auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(dout_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
     if (check) {
+      PADDLE_ENFORCE_EQ(x_dims, labels_dims,
+                        platform::errors::InvalidArgument(
+                            "Input(X) and Input(Label) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Label) is [%s].",
+                            x_dims, labels_dims));
+
       PADDLE_ENFORCE_EQ(x_dims, dout_dims,
                         platform::errors::InvalidArgument(
-                            "ShapeError:The Input(X) and Input(Out@Grad) "
-                            "should have the same "
-                            "shape, But received: the shape of Input(X) is "
-                            "[%s], the shape of "
-                            "Input(Out@GRAD) is [%s].",
+                            "Input(X) and Input(Out@Grad) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Out@Grad) is [%s].",
                             x_dims, dout_dims));
     }
+
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     ctx->ShareLoD("X", framework::GradVarName("X"));
   }
diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu
index 8e30f4eb15b6afde885512206c7eaeb721cdd44b..16db4f05e31d365d8d06174ab708e30474b8a8c2 100644
--- a/paddle/fluid/operators/bce_loss_op.cu
+++ b/paddle/fluid/operators/bce_loss_op.cu
@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
 
     auto x_data = x->data<T>();
     auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    int x_numel = x->numel();
+    auto x_numel = x->numel();
+
     platform::GpuLaunchConfig config =
         platform::getGpuLaunchConfig(x_numel, ctx);
 
@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
     framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
     T* x_cpu_data = x_cpu.data<T>();
 
-    for (int i = 0; i < x_numel; ++i) {
+    for (int64_t i = 0; i < x_numel; ++i) {
       PADDLE_ENFORCE_GE(
           x_cpu_data[i], static_cast<T>(0),
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h
index 85e120e4642a298ebff00fc0e4b6425f775443aa..dd87b69efe2869727f2db778cec44612efbcff6b 100644
--- a/paddle/fluid/operators/bce_loss_op.h
+++ b/paddle/fluid/operators/bce_loss_op.h
@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> {
     auto x_data = x->data<T>();
     auto label_data = labels->data<T>();
     auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    int x_numel = x->numel();
+    auto x_numel = x->numel();
 
     // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 -
     // x) - label * ln(x)
-    for (int i = 0; i < x_numel; ++i) {
+    for (int64_t i = 0; i < x_numel; ++i) {
       PADDLE_ENFORCE_GE(
           x_data[i], static_cast<T>(0),
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c525da5953d76d4406fbdd0d9d6e98619e409f71
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/bernoulli_op.h"
+
+#include <algorithm>
+#include <string>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+namespace paddle {
+namespace operators {
+
+class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A tensor with probabilities for generating the random binary "
+             "number");
+    AddOutput("Out", "A Tensor filled with random binary number");
+    AddComment(R"DOC(
+This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
+
+    Out ~ Bernoulli(X)
+
+)DOC");
+  }
+};
+
+class BernoulliOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    return UnaryOpUnchangedInferShape(ctx);
+  }
+};
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class BernoulliOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto *in_data = x->data<T>();
+    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int64_t size = x->numel();
+    std::uniform_real_distribution<T> dist(0.0, 1.0);
+    auto gen_ptr = framework::Generator::GetInstance();
+    std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
+    }
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(
+    bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(bernoulli,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d0837071d456068f64ebc74b115f1a7904eba41c
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/execution_policy.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/bernoulli_op.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+// it can be consistent with cpu when CUDAGenerator is provided.
+template <typename T>
+struct BernoulliCudaFunctor {
+  unsigned int seed_;
+  __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n, const T p) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(0.0, 1.0);
+    rng.discard(n);
+    return static_cast<T>(dist(rng) < p);
+  }
+};
+
+template <typename T>
+class BernoulliOpKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::random_device rd;
+    auto seed = rd();
+    const auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto* in_data = x->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int64_t size = x->numel();
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    platform::Transform<platform::CUDADeviceContext> trans;
+    auto* context =
+        static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
+    trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
+          out_data, BernoulliCudaFunctor<T>(seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    bernoulli, ops::BernoulliOpKernel<plat::CUDADeviceContext, float>,
+    ops::BernoulliOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..06a83ada17bb926d6f7d4eef10750986d00f048c
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Samples a bernoulli distribution given a probability input
+ */
+
+template <typename T>
+inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
+  PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
+                              "The probability should be <= 1, but got %f", p));
+  PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
+                              "The probability should be >= 1, but got %f", p));
+  return static_cast<T>(rand < p);
+}
+
+template <typename DeviceContext, typename T>
+class BernoulliOpKernel;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu
index c44299686516e968692fe146a5c324c7f1fa83d2..530147609fe1e47320a1cbd9223ccdfb82ba7e7a 100644
--- a/paddle/fluid/operators/cholesky_op.cu
+++ b/paddle/fluid/operators/cholesky_op.cu
@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
       for_range(matrix_band_part_functor);
     }
 
-    // TODO(guosheng): Add callback to check info
     auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count);
     auto* info_ptr = reinterpret_cast<int*>(info->ptr());
 
@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
     }
 #endif
+    // check the info
+    std::vector<int> error_info;  // only for checking positive matrix
+    error_info.resize(batch_count);
+
+    memory::Copy(platform::CPUPlace(), error_info.data(),
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
+
+    for (int i = 0; i < batch_count; ++i) {
+      PADDLE_ENFORCE_EQ(error_info[i], 0,
+                        platform::errors::PreconditionNotMet(
+                            "For batch [%d]: U(%d, %d) is zero, singular U.", i,
+                            error_info[i], error_info[i]));
+    }
   }
 
   void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo,
diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h
index b0280b00ecf447d36b199e6b6765fa7928e081f0..15dd8315362ed0221c5c8b9c523af37da38dfd7e 100644
--- a/paddle/fluid/operators/cholesky_op.h
+++ b/paddle/fluid/operators/cholesky_op.h
@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> {
             Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
             Eigen::UpLoType::Upper>
             llt_decomposition(input);
-        PADDLE_ENFORCE_EQ(
-            llt_decomposition.info(), Eigen::Success,
-            platform::errors::InvalidArgument(
-                "Cholesky decomposition was not successful. The input matrice "
-                "might not be not be positive definite."));
+        PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
+                          platform::errors::InvalidArgument(
+                              "Cholesky decomposition was not successful. The "
+                              "%d-th input matrice "
+                              "might not be not be positive definite.",
+                              i));
         output = llt_decomposition.matrixU();
       } else {
         Eigen::LLT<
             Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
             Eigen::UpLoType::Lower>
             llt_decomposition(input);
-        PADDLE_ENFORCE_EQ(
-            llt_decomposition.info(), Eigen::Success,
-            platform::errors::InvalidArgument(
-                "Cholesky decomposition was not successful. The input matrice "
-                "might not be not be positive definite."));
+        PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
+                          platform::errors::InvalidArgument(
+                              "Cholesky decomposition was not successful. The "
+                              "%d-th input matrice "
+                              "might not be not be positive definite.",
+                              i));
         output = llt_decomposition.matrixL();
       }
     }
diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index a8485a148b17c1a084b9d294c998531ec3a8e071..03abfe7eb703b021dac2261dcd9c87d440b04001 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -66,7 +66,7 @@ template <typename DeviceContext, typename T>
 class ClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto max = static_cast<T>(context.Attr<float>("max"));
+    auto max = context.Attr<T>("max");
     Tensor max_cpu;
     if (context.HasInput("Max")) {
       auto* max_t = context.Input<Tensor>("Max");
@@ -77,9 +77,8 @@ class ClipKernel : public framework::OpKernel<T> {
       }
       max = max_data[0];
     }
-    max = static_cast<T>(max);
 
-    auto min = context.Attr<float>("min");
+    auto min = context.Attr<T>("min");
     Tensor min_cpu;
     if (context.HasInput("Min")) {
       auto* min_t = context.Input<Tensor>("Min");
@@ -90,11 +89,12 @@ class ClipKernel : public framework::OpKernel<T> {
       }
       min = min_data[0];
     }
-    min = static_cast<T>(min);
-    PADDLE_ENFORCE_LT(min, max, platform::errors::InvalidArgument(
-                                    "max should be greater than min. "
-                                    "But received min = %f, max = %f",
-                                    min, max));
+
+    PADDLE_ENFORCE_LE(min, max,
+                      platform::errors::InvalidArgument(
+                          "max should be greater than or equal to min. "
+                          "But received min = %f, max = %f",
+                          min, max));
 
     auto* x_var = context.InputVar("X");
     if (x_var->IsType<framework::LoDTensor>()) {
@@ -141,7 +141,7 @@ template <typename DeviceContext, typename T>
 class ClipGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto max = static_cast<T>(context.Attr<float>("max"));
+    auto max = context.Attr<T>("max");
     Tensor max_cpu;
     if (context.HasInput("Max")) {
       auto* max_t = context.Input<Tensor>("Max");
@@ -152,9 +152,8 @@ class ClipGradKernel : public framework::OpKernel<T> {
       }
       max = max_data[0];
     }
-    max = static_cast<T>(max);
 
-    auto min = context.Attr<float>("min");
+    auto min = context.Attr<T>("min");
     Tensor min_cpu;
     if (context.HasInput("Min")) {
       auto* min_t = context.Input<Tensor>("Min");
@@ -165,7 +164,6 @@ class ClipGradKernel : public framework::OpKernel<T> {
       }
       min = min_data[0];
     }
-    min = static_cast<T>(min);
 
     auto* d_out =
         context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 3f9423ae5c26433084eb040a4c38b14feabfe89e..686b3039d4dea93042463250e17f7558a318377c 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -35,5 +35,9 @@ if(WITH_NCCL)
     op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
 endif()
 
+if(WITH_GLOO)
+    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
+endif()
+
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
diff --git a/paddle/fluid/operators/collective/barrier_op.cc b/paddle/fluid/operators/collective/barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f154a42e2be8f825e6b4a386c0c262f31b0edda
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+
+#include <memory>
+
+namespace paddle {
+namespace operators {
+
+class BarrierOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+class BarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Input data (only used in CUDAKernel).");
+    AddOutput("Out", "(Tensor) Output data (only used in CUDAKernel).");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Barrier Operator - Barrier among all pariticapitors.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(barrier, ops::BarrierOp, ops::BarrierOpMaker);
+REGISTER_OP_CPU_KERNEL(barrier, ops::BarrierOpCPUKernel<int>);
diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cad7bda63046fb5c980f9dc1c902d51f51f759
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.cu.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+
+#include <memory>
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<void>();
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    ncclRedOp_t nccl_red_type = ncclSum;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
+    auto comm_stream =
+        platform::NCCLCommContext::Instance().Get(rid, place)->stream();
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with NCCL."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(barrier, ops::BarrierOpCUDAKernel<int>);
diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a195a43540669921e99ab1d0f6163179ba0455
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/barrier.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::BarrierOptions opts(gloo->GetContext());
+    gloo::barrier(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h
index fe99a9e128d1892a093c090f33f065ae2a158056..ec55a14d085e5ef23074b4924b453a74e7478216 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ b/paddle/fluid/operators/collective/c_allgather_op.h
@@ -23,6 +23,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/allgather.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -30,7 +35,31 @@ template <typename T>
 class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    framework::DDim out_dims = in->dims();
+    auto place = ctx.GetPlace();
+
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    auto nranks = gloo->Size();
+    out_dims[0] *= nranks;
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(out_dims, place);
+
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::AllgatherOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel * nranks);
+    gloo::allgather(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 096a2f6a0959768bcb99d87b0d42edf71d98f481..be518b3bf0a397d7c20f6f1f03878987a43e84dc 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/allreduce.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -50,7 +55,53 @@ template <ReduceType red_type, typename T>
 class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("CAllReduce op do not support CPUKernel for now.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::AllreduceOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel);
+    switch (red_type) {
+      case kRedSum:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::sum<T>));
+        break;
+      case kRedMax:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::max<T>));
+        break;
+      case kRedMin:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::min<T>));
+        break;
+      case kRedProd:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::product<T>));
+        break;
+      default:
+        PADDLE_ENFORCE_EQ(true, false,
+                          platform::errors::InvalidArgument(
+                              "Invalid reduce type: %d.", red_type));
+    }
+    gloo::allreduce(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h
index 4ceb0aa835fe116cdc14444dfb7ea6046f33c482..eb4acb9a369fc7bfa8e23b5544f54c955d4a87b6 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.h
+++ b/paddle/fluid/operators/collective/c_broadcast_op.h
@@ -22,6 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/broadcast.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -29,7 +34,27 @@ template <typename T>
 class CBroadcastOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root = ctx.Attr<int>("root");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::BroadcastOptions opts(gloo->GetContext());
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root);
+    gloo::broadcast(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index b460dc40ede31cc66e46e9ada1d4b8b5ed9dba8c..a3bf1f4dfb1cb09fc864f891dda793ecde9027c6 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -52,10 +52,12 @@ class CCommInitOp : public framework::OperatorBase {
     int nranks = Attr<int>("nranks");
     int rank_id = Attr<int>("rank");
     int rid = Attr<int>("ring_id");
-
+    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    if (Attr<int>("device_id") >= 0) {
+      device_id = Attr<int>("device_id");
+    }
     platform::NCCLCommContext::Instance().CreateNCCLComm(
-        nccl_id, nranks, rank_id,
-        BOOST_GET_CONST(platform::CUDAPlace, place).device, rid);
+        nccl_id, nranks, rank_id, device_id, rid);
 #else
     PADDLE_THROW("PaddlePaddle should compile with GPU.");
 #endif
@@ -74,6 +76,11 @@ Initialize collective communicatoin context within this trainer
     AddAttr<int>("nranks", "(int) The number of ranks of distributed trainers");
     AddAttr<int>("rank",
                  "(int) The rank of the trainer in distributed training.");
+    AddAttr<int>("device_id",
+                 "(int) The deivce_id on which to initialize the communicator."
+                 "Now, you only have to set this attr manually for pipeline "
+                 "training. Otherwise, make it as default.")
+        .SetDefault(-1);
     AddAttr<int>("ring_id", "(int default 0) user specified ring id")
         .SetDefault(0);
   }
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..425351877689f7e3ad8e0a46d2226f5f751a4016
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceMaxOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Max"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, ops::CReduceOp,
+                             ops::CReduceMaxOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_max,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e260346b4bdd8aced0df59c72f5adb4c479e8d0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_max,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e849641e639eeceb48fc95656b269988c827006
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceMinOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Min"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, ops::CReduceOp,
+                             ops::CReduceMinOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_min,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..77a75ed0b7af2a7946c02bfa0f33038aa0090c5b
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_min,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..81dc5c35bf14e569fe90743c1dc62a61fd5655ba
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_op.h
@@ -0,0 +1,201 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/reduce.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };
+
+class CReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+template <ReduceType red_type, typename T>
+class CReduceOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root_id = ctx.Attr<int>("root_id");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::ReduceOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root_id);
+    switch (red_type) {
+      case kRedSum:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::sum<T>));
+        break;
+      case kRedMax:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::max<T>));
+        break;
+      case kRedMin:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::min<T>));
+        break;
+      case kRedProd:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::product<T>));
+        break;
+      default:
+        PADDLE_ENFORCE_EQ(true, false,
+                          platform::errors::InvalidArgument(
+                              "Invalid reduce type: %d.", red_type));
+    }
+    gloo::reduce(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+template <ReduceType red_type, typename T>
+class CReduceOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<void>();
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    int root = ctx.Attr<int>("root_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    ncclRedOp_t nccl_red_type = ncclSum;
+    switch (red_type) {
+      case kRedSum:
+        nccl_red_type = ncclSum;
+        break;
+
+      case kRedMax:
+        nccl_red_type = ncclMax;
+        break;
+
+      case kRedMin:
+        nccl_red_type = ncclMin;
+        break;
+
+      case kRedProd:
+        nccl_red_type = ncclProd;
+        break;
+
+      default:
+        PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument(
+                                           "red_type must be one of kRedSum, "
+                                           "kRedMax, kRedMin, kRedProd."));
+    }
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
+        sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(),
+        stream));
+#else
+    PADDLE_ENFORCE_EQ(true, false,
+                      platform::errors::Unavailable(
+                          "PaddlePaddle should compile with GPU.."));
+#endif
+  }
+};
+
+class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be reduced.");
+    AddOutput("Out", "(Tensor) the reduced result.");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+CReduce %s Operator
+
+Call collective Reduce with reduce type %s. If input and output are
+the same variable, in-place reduce will be used.
+)DOC",
+                               GetName(), GetName()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..64935df856ec79f427bdcd21e03b7c493c31ac1e
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceProdOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Prod"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, ops::CReduceOp,
+                             ops::CReduceProdOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_prod,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07e431f7bc838caa9bc3abdcd0be1beb94b96635
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_prod,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e20cee7e186a462aedc1881c6e34cacc8d09de0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceSumOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Sum"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, ops::CReduceOp,
+                             ops::CReduceSumOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_sum,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9826422c16cb67f9f7101643918a83898c606b3
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_sum,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_scatter_op.cc b/paddle/fluid/operators/collective/c_scatter_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..908708e6e328f54466d4bb69b30fd607e14d1fe9
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_scatter_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CScatterOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CScatter");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CScatter");
+    int root_id = ctx->Attrs().Get<int>("root");
+    int ring_id = ctx->Attrs().Get<int>("ring_id");
+    int nranks = ctx->Attrs().Get<int>("nranks");
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) must be greater than 1 "
+                          "to use collective op (c_scatter op).",
+                          nranks));
+    PADDLE_ENFORCE_GE(
+        root_id, 0,
+        platform::errors::InvalidArgument(
+            "The root_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    framework::DDim dim = ctx->GetInputDim("X");
+    dim[0] = dim[0] / nranks;
+    if (dim[0] < 0) dim[0] = -1;
+    ctx->SetOutputDim("Out", dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class CScatterOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be broadcasted.");
+    AddOutput("Out", "(Tensor) the result of broadcast.");
+    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("root", "(int default 0) root id for broadcasting.")
+        .SetDefault(0);
+    AddAttr<int>("nranks", "(int default 1) number of ranks.").SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+CScatter Operator
+Scatter the source to all participators.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_scatter, ops::CScatterOp, ops::CScatterOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_scatter, ops::CScatterOpCPUKernel<float>,
+                       ops::CScatterOpCPUKernel<double>,
+                       ops::CScatterOpCPUKernel<int>,
+                       ops::CScatterOpCPUKernel<int64_t>,
+                       ops::CScatterOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d9e6b4b7d99044f584e9e21062a786252d60f76
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_scatter_op.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
+
+    int nranks = ctx.Attr<int>("nranks");
+    int root_id = ctx.Attr<int>("root");
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+    PADDLE_ENFORCE_EQ(nranks, comm->nranks(),
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) you set of must "
+                          "be equal to comm->nranks (%d).",
+                          nranks, comm->nranks()));
+    PADDLE_ENFORCE_GE(
+        root_id, 0,
+        platform::errors::InvalidArgument(
+            "The root_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_scatter_op must be non-negative.",
+            ring_id));
+
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    framework::DDim x_dims = x->dims();
+    framework::DDim out_dims(x_dims);
+    framework::Tensor temp;
+    auto out_ptr = temp.mutable_data<T>(out_dims, place);
+    if (root_id == comm->rank()) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
+          root_id, comm->comm(), stream));
+
+      framework::TensorCopy(*static_cast<const framework::Tensor*>(x), place,
+                            *platform::DeviceContextPool::Instance().Get(place),
+                            static_cast<framework::Tensor*>(&temp));
+    } else {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+          out_ptr, numel, dtype, root_id, comm->comm(), stream));
+    }
+
+    out_dims[0] = out_dims[0] / nranks;
+    auto start_index = out_dims[0] * comm->rank();
+    auto end_index = start_index + out_dims[0];
+    temp = temp.Slice(start_index, end_index);
+    temp.Resize(out_dims);
+    out->mutable_data<T>(out_dims, place);
+    framework::TensorCopySync(*static_cast<const framework::Tensor*>(&temp),
+                              place, static_cast<framework::Tensor*>(out));
+    out->Resize(out_dims);
+#else
+    PADDLE_ENFORCE_EQ(
+        true, false,
+        platform::errors::Unavailable("PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_scatter, ops::CScatterOpCUDAKernel<float>,
+                        ops::CScatterOpCUDAKernel<double>,
+                        ops::CScatterOpCUDAKernel<int>,
+                        ops::CScatterOpCUDAKernel<int64_t>,
+                        ops::CScatterOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..71a5f488ebc11a93cece9b85f6af288a4662b2d8
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/scatter.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CScatterOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root_id = ctx.Attr<int>("root");
+
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+
+    int64_t send_numel = out->numel();
+    auto nranks = gloo->Size();
+    auto rank = gloo->Rank();
+    T* recv_buff = out->data<T>();
+    gloo::ScatterOptions opts(gloo->GetContext());
+    if (root_id == rank) {
+      T* send_buff = const_cast<T*>(in->data<T>());
+      std::vector<T*> ptrs(nranks);
+      for (int i = 0; i < nranks; ++i) {
+        ptrs[i] = send_buff;
+        send_buff += send_numel;
+      }
+      opts.setInputs(ptrs, send_numel);
+    }
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root_id);
+
+    gloo::scatter(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22b212fc1b9f8844f0ae3555ac6d63af1f48d1cd
--- /dev/null
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+#include <algorithm>
+#include <vector>
+
+// This file almostly contains all the infershape functions that are used in
+// operators.
+
+namespace paddle {
+namespace operators {
+namespace details {
+inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
+                                   const framework::DDim &y_dims,
+                                   int *x_dims_array, int *y_dims_array,
+                                   int *out_dims_array, const int max_dim,
+                                   const int axis) {
+  PADDLE_ENFORCE_GE(
+      axis, 0,
+      platform::errors::InvalidArgument(
+          "Axis should be great than or equal to 0, but received axis is %d.",
+          axis));
+  PADDLE_ENFORCE_LT(axis, max_dim,
+                    platform::errors::InvalidArgument(
+                        "Axis should be less than %d, but received axis is %d.",
+                        max_dim, axis));
+  if (x_dims.size() > y_dims.size()) {
+    std::fill(y_dims_array, y_dims_array + axis, 1);
+    if (axis + y_dims.size() < max_dim) {
+      std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array);
+    std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array + axis);
+  } else {
+    std::fill(x_dims_array, x_dims_array + axis, 1);
+    if (axis + x_dims.size() < max_dim) {
+      std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis);
+    std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array);
+  }
+
+  for (int i = 0; i < max_dim; i++) {
+    PADDLE_ENFORCE_EQ(
+        x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
+            y_dims_array[i] <= 1,
+        true, platform::errors::InvalidArgument(
+                  "Broadcast dimension mismatch. Operands could "
+                  "not be broadcast together with the shape of X = [%s] and "
+                  "the shape of Y = [%s]. Received [%d] in X is not equal to "
+                  "[%d] in Y at i:%d.",
+                  x_dims, y_dims, x_dims_array[i], y_dims_array[i], i));
+    if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) ||
+        (x_dims_array[i] == 1 && y_dims_array[i] == 1)) {
+      out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
+    } else {
+      out_dims_array[i] = -1;
+    }
+  }
+}
+}  // namespace details
+
+// shape input(0) -> output(0) without change.
+void UnaryOpUnchangedInferShape(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  ctx->ShareDim(x_name, /*->*/ out_name);
+  ctx->ShareLoD(x_name, /*->*/ out_name);
+}
+
+// shape input(0) -> output(0) without change, check if axis in range [-Rank(x),
+// Rank(x)-1]
+void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  auto x_dim = ctx->GetInputDim(x_name);
+  auto x_rank = x_dim.size();
+  auto axis = ctx->Attrs().Get<int>("axis");
+  PADDLE_ENFORCE_GE(
+      axis, -x_rank,
+      platform::errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis, x_rank));
+  PADDLE_ENFORCE_LT(
+      axis, x_rank,
+      platform::errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis, x_rank));
+  ctx->ShareDim(x_name, /*->*/ out_name);
+  ctx->ShareLoD(x_name, /*->*/ out_name);
+}
+
+// broadcast input(0) and input(1) -> output(0)
+void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto y_name = ctx->GetInputNameByIdx(1);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  auto x_dims = ctx->GetInputDim(x_name);
+  auto y_dims = ctx->GetInputDim(y_name);
+  PADDLE_ENFORCE_EQ(
+      ctx->GetInputsVarType(y_name).front(),
+      framework::proto::VarType::LOD_TENSOR,
+      platform::errors::InvalidArgument(
+          "The var type of input %s should be LoDTensor, but got %s.",
+          ctx->Inputs(y_name).front(), ctx->GetInputsVarType(y_name).front()));
+
+  if (ctx->GetInputsVarType(x_name).front() ==
+      framework::proto::VarType::SELECTED_ROWS) {
+    PADDLE_ENFORCE_EQ(y_dims.size(), 1u,
+                      platform::errors::InvalidArgument(
+                          "For binary broadcastable operator, if X is "
+                          "Sparse(VarType.SELECTED_ROWS"
+                          "), Y must be scalar, and the size of Y should be 1. "
+                          "But reveived the size of Y = %s.",
+                          y_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        y_dims[0], 1,
+        platform::errors::InvalidArgument(
+            "For binary broadcastable operator, if X is "
+            "Sparse(VarType.SELECTED_ROWS"
+            "), Y must be scalar, the first dimension of Y should be 1. "
+            "But reveived the first dimension of Y = %s.",
+            y_dims[0]));
+  } else if (ctx->GetInputsVarType(x_name).front() !=
+             framework::proto::VarType::LOD_TENSOR) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "For binary broadcastable operator, the var type of input X should "
+        "be LOD_TENSOR, but got %s",
+        ctx->GetInputsVarType(x_name).front()));
+  }
+
+  if (x_dims == y_dims) {
+    ctx->ShareDim(x_name, /*->*/ out_name);
+    ctx->ShareLoD(x_name, /*->*/ out_name);
+  } else {
+    int max_dim = std::max(x_dims.size(), y_dims.size());
+    int axis = ctx->Attrs().Get<int>("axis");
+    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+    std::vector<int> x_dims_array(max_dim);
+    std::vector<int> y_dims_array(max_dim);
+    std::vector<int> out_dims_array(max_dim);
+    details::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                                    y_dims_array.data(), out_dims_array.data(),
+                                    max_dim, axis);
+    ctx->SetOutputDim(out_name, framework::make_ddim(out_dims_array));
+    ctx->ShareLoD(x_name, /*->*/ out_name);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cb9eab2865ce068a4f776bc63070c59bf029481
--- /dev/null
+++ b/paddle/fluid/operators/common_infer_shape_functions.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+// This file almostly contains all the infershape functions that are used in
+// operators.
+
+namespace paddle {
+namespace operators {
+
+// shape input(0) -> output(0) without change.
+void UnaryOpUnchangedInferShape(framework::InferShapeContext* ctx);
+// shape input(0) -> output(0) without change, check if axis in range [-Rank(x),
+// Rank(x)-1]
+void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext* ctx);
+// broadcast input(0) and input(1) -> output(0)
+void BinaryOpBroadcastInferShape(framework::InferShapeContext* ctx);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index 74589dcb6a74c79299ef682de0bce146f33ec261..fb8cde70f5324f42fbc05fdfd65b548e0e58206a 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/controlflow/logical_op.h"
+#include <algorithm>
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -97,19 +99,19 @@ class BinaryLogicalOp : public LogicalOp {
     OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type);
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-
-    int product_x = framework::product(dim_x);
-    int product_y = framework::product(dim_y);
-    bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0);
-    if (check) {
-      PADDLE_ENFORCE_EQ(product_x, product_y,
-                        platform::errors::InvalidArgument(
-                            "The number of elements in X and Y should be same, "
-                            "but received %d != %d",
-                            product_x, product_y));
+    if (dim_x == dim_y) {
+      context->SetOutputDim("Out", dim_x);
+    } else {
+      int max_dim = std::max(dim_x.size(), dim_y.size());
+      int axis = std::abs(dim_x.size() - dim_y.size());
+      std::vector<int> x_dims_array(max_dim);
+      std::vector<int> y_dims_array(max_dim);
+      std::vector<int> out_dims_array(max_dim);
+      GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(),
+                             y_dims_array.data(), out_dims_array.data(),
+                             max_dim, axis);
+      context->SetOutputDim("Out", framework::make_ddim(out_dims_array));
     }
-
-    context->SetOutputDim("Out", context->GetInputDim("X"));
     context->ShareLoD("X", "Out");
   }
 };
diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h
index 4a83e0fda6e4ecdb1112f096eb37159337c37147..2c39201a426a25bb8595f415d80192080f1f1931 100644
--- a/paddle/fluid/operators/controlflow/logical_op.h
+++ b/paddle/fluid/operators/controlflow/logical_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <math.h>
 #include <type_traits>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -57,10 +58,8 @@ class BinaryLogicalOpKernel
     auto* y = context.Input<framework::Tensor>("Y");
     auto* out = context.Output<framework::Tensor>("Out");
     Functor binary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(), y->data<T>(),
-          out->mutable_data<bool>(context.GetPlace()), binary_func);
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
+                                                          binary_func, out);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index a8c4107add1beeb9a7a5aedad9be982b6d8b6aac..9ed169fe3502e0c34b9f37d6520edc1a3fbfa91c 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar(
     auto ar = paddle::framework::AttrReader(attrs);
     const std::string data_format = ar.Get<std::string>("data_format");
     auto dl = framework::StringToDataLayout(data_format);
-    // Some models may have intentionally set "AnyLayout" for pool
+    // Some models may have intentionally set "AnyLayout" for conv
     // op. Treat this as NCHW (default data_format value)
     if (dl != framework::DataLayout::kAnyLayout) {
       return framework::OpKernelType(expected_kernel_type.data_type_,
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index b44aa4ce4f893720ef55a7daf1d7b1e757c7480c..25e887ba6675e6c28bcd44c3b57c2ea571c075e3 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   auto filter_dims = ctx->GetInputDim("Filter");
   std::vector<int> output_size =
       ctx->Attrs().Get<std::vector<int>>("output_size");
+  std::vector<int> output_padding =
+      ctx->Attrs().Get<std::vector<int>>("output_padding");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
         platform::errors::InvalidArgument(
             "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
             "should be the same."));
+  if (output_padding.size())
+    PADDLE_ENFORCE_EQ(
+        output_padding.size(), strides.size(),
+        platform::errors::InvalidArgument(
+            "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
+            "should be the same."));
 
   const int64_t C =
       (data_layout != DataLayout::kNHWC ? in_dims[1]
@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                 infer_shape + strides[i]));
       }
       output_shape.push_back(output_size[i]);
+    } else if (output_padding.size()) {
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_GE(
+            output_padding[i], 0,
+            platform::errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should not be "
+                "less than the 0. But received output_padding = "
+                "[%s], whose dim %d is less than 0",
+                framework::make_ddim(output_padding), i));
+        PADDLE_ENFORCE_LT(
+            output_padding[i], std::max(strides[i], dilations[i]),
+            platform::errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should be less "
+                "than either stride or dilation. But received output_size = "
+                "[%s], "
+                "whose dim %d is not less than either stride (%d)  or "
+                "dilation (%d)",
+                framework::make_ddim(output_size), i, strides[i],
+                dilations[i]));
+      }
+      output_shape.push_back((infer_shape + output_padding[i]));
     } else {
       output_shape.push_back(infer_shape);
     }
@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() {
            "The format of output tensor is X (one-dimensional) of size equal"
            "to the number of output channels. Only used with MKL-DNN.")
       .AsDispensable();
-
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is the same as input tensor.");
+  AddAttr<std::vector<int>>("output_padding",
+                            "(vector<int> default: []), Additional size added "
+                            "to one side of each dimension in the output "
+                            "shape")
+      .SetDefault({});
   AddAttr<std::vector<int>>("output_size",
                             "(vector<int> default: []), the "
                             "size of the output tensor")
@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() {
             "Where N is batch size, C is "
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>("output_padding",
+                            "(vector<int> default: []), Additional size added "
+                            "to one side of each dimension in the output "
+                            "shape")
+      .SetDefault({});
   AddAttr<std::vector<int>>("output_size",
                             "(vector<int> default: []), the "
                             "size of the output tensor")
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 16e2ca464b5c4de6aa65109cd794d17e4dcd6a2a..7081490fd1bf0e26cb8aa90d69a76a5476cef044 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(Weight) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(Cache) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_h"),
-                   "Output(last_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_c"),
-                   "Output(last_c) of LSTM should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Reserve"), "Output", "Reserve", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("StateOut"), "Output", "StateOut",
+                   "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("LastH"), "Output", "LastH", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM");
 
     auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
+    auto init_dims = ctx->GetInputDim("InitH");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of Input in CudnnLSTM  must be 3. But "
+                          "received Input's rank is %d.",
+                          in_dims.size()));
+    PADDLE_ENFORCE_EQ(init_dims.size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of InitH in CudnnLSTM  must be 3. But "
+                          "received InitH's rank is %d.",
+                          init_dims.size()));
+
+    PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1],
+                      platform::errors::InvalidArgument(
+                          "The in_dims[1] (Input dims) and init_dims[1] (InitH "
+                          "dims) should be equal. But "
+                          "received in_dims[1] is %d and init_dims[1] is %d.",
+                          in_dims[1], init_dims[1]));
+    PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2],
+                      platform::errors::InvalidArgument(
+                          "The in_dims[2] (Input dims) and init_dims[2] (InitH "
+                          "dims) should be equal. But "
+                          "received in_dims[2] is %d and init_dims[2] is %d.",
+                          in_dims[2], init_dims[2]));
 
     auto out_dims = in_dims;
     auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
-    out_dims[2] = hidden_size;
+    bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
+    out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
 
+    auto last_dims = init_dims;
+    last_dims[0] = is_bidirec ? last_dims[0] * 2 : last_dims[0];
     ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
-    ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
+    ctx->SetOutputDim("LastH", last_dims);
+    ctx->SetOutputDim("LastC", last_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
+        ctx.device_context());
   }
 };
 
@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) the learnable hidden-hidden weights."
              " The shape is (N), where N is total weight size of the LSTM. "
              " cudnn concatenate all the weight to one Tensor");
-    AddInput("Cache",
-             "The cache of dropout op, a RAW type variable including random "
-             "number generator states and some descriptors, which is used in "
-             "cudnn kernel.")
-        .AsDispensable();
+    AddOutput("Reserve",
+              "(Tensor, a temporary output Tensor to store the reserve_data "
+              "of cudnn kernel.")
+        .AsIntermediate();
+    AddOutput("StateOut",
+              "Share memory with State. "
+              "Store the global drop state when training");
     AddOutput("Out",
               "(Tensor) the hidden state of LSTM operator. "
               "The shape is ( seq_len x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirec is True, the shape will be ( seq_len x "
               "batch_size x hidden_size * 2) ");
-    AddOutput("last_h",
+    AddOutput("LastH",
               "(Tensor) the hidden state of the last step. "
               "The shape is ( num_layers x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirec is True, the shape will be (num_layers*2 x "
               "batch_size x hidden_size)");
-    AddOutput("last_c",
+    AddOutput("LastC",
               "(Tensor) the cell state of the last step"
               "The shape is ( num_layers x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirect is True, the shape will be (num_layers*2 x "
               "batch_size x hidden_size*2)");
-    AddAttr<int>("max_len",
-                 "max length of the LSTM op"
-                 "the first dim of the Input can NOT be greater than max_len")
-        .SetDefault(20);
     AddAttr<float>(
         "dropout_prob",
         "dropout prob of the dropout op"
@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("is_bidirec",
                   "is_bidirec"
                   "if it is bidirectional rnn"
-                  "The will affect the shape of the Out, last_h, and last_c")
+                  "The will affect the shape of the Out, LastH, and LastC")
         .SetDefault(false);
     AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
     AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
     AddAttr<int>("num_layers", "the total layer number of the LSTM")
         .SetDefault(1);
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
-    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
     AddComment(R"DOC(
 CUDNN LSTM implementation
 
@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(last_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad");
 
     auto SetOutGradDim = [&ctx](const std::string& name) {
       auto g_name = framework::GradVarName(name);
@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
     SetOutGradDim("InitH");
     SetOutGradDim("InitC");
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
 };
 
 template <typename T>
@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("InitH", this->Input("InitH"));
     op->SetInput("InitC", this->Input("InitC"));
     op->SetInput("W", this->Input("W"));
-    if (this->HasInput("Cache")) {
-      op->SetInput("Cache", this->Input("Cache"));
-    }
+    op->SetInput("Reserve", this->Output("Reserve"));
+    op->SetInput("StateOut", this->Output("StateOut"));
     op->SetInput("Out", this->Output("Out"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetInput(framework::GradVarName("last_c"), this->OutputGrad("last_c"));
-    op->SetInput(framework::GradVarName("last_h"), this->OutputGrad("last_h"));
+    op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC"));
+    op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH"));
 
     op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
     op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 579dddee8e82183b778f03595bb4657002262073..37e5e518ea2af9bb437775c8fa7e86816bb1d8ae 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
 
 namespace paddle {
 namespace operators {
@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     auto w = ctx.Input<Tensor>("W");
 
     Tensor *out = ctx.Output<Tensor>("Out");
-    Tensor *last_h = ctx.Output<Tensor>("last_h");
-    Tensor *last_c = ctx.Output<Tensor>("last_c");
+    Tensor *last_h = ctx.Output<Tensor>("LastH");
+    Tensor *last_c = ctx.Output<Tensor>("LastC");
+    Tensor *reserve = ctx.Output<Tensor>("Reserve");
+    Tensor *state_out = ctx.Output<Tensor>("StateOut");
 
     const T *x_data = x->data<T>();
     const T *init_h_data = init_h->data<T>();
@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
     T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
 
-    size_t max_len = ctx.Attr<int>("max_len");
     float dropout_prob = ctx.Attr<float>("dropout_prob");
     bool is_bidirec = ctx.Attr<bool>("is_bidirec");
-    int input_size = ctx.Attr<int>("input_size");
     int hidden_size = ctx.Attr<int>("hidden_size");
     int num_layers = ctx.Attr<int>("num_layers");
     bool is_test = ctx.Attr<bool>("is_test");
+    int seed = ctx.Attr<int>("seed");
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    if (!cache_var) {
-      // The RAW type cache variable wouldn't be created and broadcasted on
-      // multi-devices before the first running.
-      // use parent scope to make cache persistable
-      auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
-      auto cache_var_name = ctx.InputNames("Cache")[0];
-      cache_var = scope->Var(cache_var_name);
-    }
-    CudnnRNNCache *cudnn_rnn_cache = nullptr;
-    if (cache_var->IsInitialized()) {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-    } else {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-      std::random_device rnd;
-      int seed = ctx.Attr<int>("seed");
-      if (seed == -1) {
-        seed = rnd();
-      }
-
-      auto input_w_numel = w->numel();
-      auto batch_size = x->dims()[1];
-      cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size,
-                            input_size, hidden_size, num_layers, dropout_prob,
-                            is_bidirec, seed, input_w_numel);
-    }
 
-    auto run_seq_len = x->dims()[0];
+    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+
+    auto input_w_numel = w->numel();
+    auto seq_len = x->dims()[0];
+    auto batch_size = x->dims()[1];
+    auto input_dim = x->dims()[2];
+    size_t reserve_size;
+    bool state_initialized = state_out->IsInitialized() ? true : false;
+    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
+        framework::ToDataType(std::type_index(typeid(T))));
+    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
+                          input_dim, hidden_size, num_layers, dropout_prob,
+                          is_bidirec, seed, input_w_numel, &reserve_size,
+                          state_out, state_initialized, cudnn_type);
+
+    auto *reserve_data = reserve->mutable_data<uint8_t>(
+        {static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
 
     if (is_test) {
       // for inference
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
+          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
+          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
+          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
           cudnn_rnn_cache->workspace_size_));
     } else {
       // for train
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_,
-          cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-          cudnn_rnn_cache->reserve_size_));
+          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
+          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
+          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
+          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_, reserve_data, reserve_size));
     }
+    delete cudnn_rnn_cache;
   }
 };
 
@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto *weight = ctx.Input<Tensor>("W");
     auto *init_h = ctx.Input<Tensor>("InitH");
     auto *init_c = ctx.Input<Tensor>("InitC");
-    // auto * last_h = ctx.Input<Tensor>("last_h");
-    // auto * last_c = ctx.Input<Tensor>("last_c");
+    auto *reserve = ctx.Input<Tensor>("Reserve");
+    auto *state_out = ctx.Input<Tensor>("StateOut");
+
     auto *out = ctx.Input<Tensor>("Out");
     auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
-    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
-
-    // auto* init_h = ctx.Input<Tensor>("init_h");
-    // auto* init_c = ctx.Input<Tensor>("init_c");
+    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("LastH"));
+    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("LastC"));
 
     auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    PADDLE_ENFORCE(cache_var->IsInitialized());
-    CudnnRNNCache *cudnn_rnn_cache =
-        const_cast<framework::Variable *>(cache_var)
-            ->GetMutable<CudnnRNNCache>();
 
     auto input_dims = input->dims();
     auto init_h_dims = init_h->dims();
     auto init_c_dims = init_c->dims();
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    weight_grad->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-
-    T *init_h_grad_data = NULL;
-    if (init_h_grad == nullptr) {
-      Tensor init_h_grad_temp;
-      init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
-
-      init_h_grad_data = init_h_grad_temp.data<T>();
-    } else {
-      init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
-      init_h_grad_data = init_h_grad->data<T>();
-    }
-
-    T *init_c_grad_data = NULL;
-    if (init_c_grad == nullptr) {
-      Tensor init_c_grad_temp;
-      init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
 
-      init_c_grad_data = init_c_grad_temp.data<T>();
-    } else {
-      init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
-      init_c_grad_data = init_c_grad->data<T>();
-    }
+    auto *weight_data = weight->data<T>();
+    auto *init_h_data = init_h->data<T>();
+    auto *init_c_data = init_c->data<T>();
+    auto *out_data = out->data<T>();
+    auto *out_grad_data = out_grad->data<T>();
+    auto *last_h_grad_data = last_h_grad->data<T>();
+    auto *last_c_grad_data = last_c_grad->data<T>();
 
-    const T *last_h_grad_data = NULL;
-    if (last_h_grad == nullptr) {
-      Tensor last_h_grad_temp;
-      last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
-
-      last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
-    } else {
-      last_h_grad_data = last_h_grad->data<T>();
-    }
-
-    const T *last_c_grad_data = NULL;
-    if (last_c_grad == nullptr) {
-      Tensor last_c_grad_temp;
-      last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
-
-      last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
-    } else {
-      last_c_grad_data = last_c_grad->data<T>();
-    }
+    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
+    weight_grad->mutable_data<T>(ctx.GetPlace());
+    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
 
-    const T *out_grad_data = NULL;
-    if (out_grad == nullptr) {
-      Tensor out_grad_temp;
-      out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
-      zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
+    in_grad->mutable_data<T>(input_dims, ctx.GetPlace());
+    auto *in_grad_data = in_grad->data<T>();
 
-      out_grad_data = (const T *)out_grad_temp.data<T>();
-    } else {
-      out_grad_data = out_grad->data<T>();
-    }
+    init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
+    auto *init_h_grad_data = init_h_grad->data<T>();
 
-    // zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
-    // zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
+    init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
+    auto *init_c_grad_data = init_c_grad->data<T>();
 
-    auto out_data = out->data<T>();
-    // auto out_grad_data = out_grad->data<T>();
-    auto weight_data = weight->data<T>();
-    auto init_h_data = init_h->data<T>();
-    auto init_c_data = init_c->data<T>();
-    auto in_grad_data = in_grad->data<T>();
+    float dropout_prob = ctx.Attr<float>("dropout_prob");
+    bool is_bidirec = ctx.Attr<bool>("is_bidirec");
+    int hidden_size = ctx.Attr<int>("hidden_size");
+    int num_layers = ctx.Attr<int>("num_layers");
+    int seed = ctx.Attr<int>("seed");
+
+    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+
+    auto input_w_numel = weight->numel();
+    auto seq_len = input_dims[0];
+    auto batch_size = input->dims()[1];
+    auto input_dim = input->dims()[2];
+    size_t reserve_size;
+    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
+        framework::ToDataType(std::type_index(typeid(T))));
+    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
+                          input_dim, hidden_size, num_layers, dropout_prob,
+                          is_bidirec, seed, input_w_numel, &reserve_size,
+                          const_cast<Tensor *>(state_out), true, cudnn_type);
 
     auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
-    auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
+    const uint8_t *reserve_data = reserve->data<uint8_t>();
 
-    auto run_seq_len = input_dims[0];
-    PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
-                      "cudnn running seq_len CAN not greater max_lengh");
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
-        out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
-        cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
-        weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
-        cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
-        in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
-        cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
-        cudnn_rnn_cache->workspace_size_, reserve_data,
-        cudnn_rnn_cache->reserve_size_));
+        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_,
+        out_data, cudnn_rnn_cache->y_desc_, out_grad_data,
+        cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_,
+        last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data,
+        cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_,
+        init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data,
+        cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_,
+        init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_,
+        const_cast<uint8_t *>(reserve_data), reserve_size));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
-        init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
+        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+        input->data<T>(), cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
+        cudnn_rnn_cache->y_desc_, out->data<T>(),
         cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
-        weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-        cudnn_rnn_cache->reserve_size_));
+        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_,
+        weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
+        reserve_size));
+    delete cudnn_rnn_cache;
   }
 };
 
@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>,
+                        ops::CudnnLSTMGPUKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>,
+                        ops::CudnnLSTMGPUGradKernel<double>);
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index cd33338abc6223a0ae122cbb60f040562b48a761..13a3e7d09b9f628f31bb9ff3b6137acf6d929c5c 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
@@ -24,16 +25,12 @@ struct CudnnRNNCache {
   CudnnRNNCache() {
     x_desc_ = NULL;
     y_desc_ = NULL;
-    dx_desc_ = NULL;
-    dy_desc_ = NULL;
   }
   ~CudnnRNNCache() { release(); }
 
   cudnnRNNDescriptor_t rnn_desc_;
   cudnnTensorDescriptor_t *x_desc_;
   cudnnTensorDescriptor_t *y_desc_;
-  cudnnTensorDescriptor_t *dx_desc_;
-  cudnnTensorDescriptor_t *dy_desc_;
 
   cudnnTensorDescriptor_t hx_desc_;
   cudnnTensorDescriptor_t cx_desc_;
@@ -55,13 +52,9 @@ struct CudnnRNNCache {
   cudnnFilterDescriptor_t dw_desc_;
 
   size_t workspace_size_;
-  size_t reserve_size_;
-  framework::Tensor reserve_data_;
   framework::Tensor workspace_data_;
 
-  framework::Tensor dropout_state_;
-
-  size_t max_length_;
+  size_t seq_length_;
 
   float dropout_prob_;
   bool is_bidirec_;
@@ -72,10 +65,12 @@ struct CudnnRNNCache {
   int num_layers_;
   int seed_;
 
-  void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len,
+  void init(cudnnHandle_t handle, const platform::Place &place, size_t seq_len,
             int batch_size, int input_size, int hidden_size, int num_layers,
-            float dropout_prob, bool is_bidirec, int seed, int weight_numel) {
-    max_length_ = max_len;
+            float dropout_prob, bool is_bidirec, int seed, int weight_numel,
+            size_t *reserve_size_, framework::Tensor *dropout_state_,
+            bool initialized, cudnnDataType_t cudnn_type) {
+    seq_length_ = seq_len;
     batch_size_ = batch_size;
     input_size_ = input_size;
     hidden_size_ = hidden_size;
@@ -84,55 +79,34 @@ struct CudnnRNNCache {
     is_bidirec_ = is_bidirec;
     seed_ = seed;
 
-    x_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    y_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    int dim_a[3];
-    int stride_a[3];
+    const auto numDirections = is_bidirec_ ? 2 : 1;
+    auto cudnn_size =
+        cudnn_type == CUDNN_DATA_FLOAT ? sizeof(float) : sizeof(double);
+
+    x_desc_ = new cudnnTensorDescriptor_t[seq_length_];
+    y_desc_ = new cudnnTensorDescriptor_t[seq_length_];
+    std::vector<int> dims = {batch_size_, input_size_, 1};
+    std::vector<int> strides = {input_size_, 1, 1};
+
+    std::vector<int> dims_y = {batch_size_, hidden_size_ * numDirections, 1};
+    std::vector<int> strides_y = {hidden_size_ * numDirections, 1, 1};
 
-    for (size_t i = 0; i < max_length_; ++i) {
+    for (size_t i = 0; i < seq_length_; ++i) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
-      dim_a[0] = batch_size_;
-      dim_a[1] = input_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-
-      dim_a[0] = batch_size_;
-      dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
 
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+          x_desc_[i], cudnn_type, 3, dims.data(), strides.data()));
+
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+          y_desc_[i], cudnn_type, 3, dims_y.data(), strides_y.data()));
     }
 
-    dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
-    dim_a[1] = batch_size_;
-    dim_a[2] = hidden_size_;
-
-    stride_a[0] = dim_a[2] * dim_a[1];
-    stride_a[1] = dim_a[2];
-    stride_a[2] = 1;
+    std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
+                                hidden_size_};
+    std::vector<int> strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1};
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
@@ -152,33 +126,44 @@ struct CudnnRNNCache {
         platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        hx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        cx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        hy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        cy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dhx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dcx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dhy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dcy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
 
     size_t state_size;
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-    dropout_state_.Resize({static_cast<int64_t>(state_size)});
-    auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
-        dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
-        seed_));
+    if (!initialized) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+      dropout_state_->Resize({static_cast<int64_t>(state_size)});
+      uint8_t *dropout_state_data =
+          dropout_state_->mutable_data<uint8_t>(place);
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
+          dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
+          seed_));
+    } else {
+      uint8_t *dropout_state_data = dropout_state_->data<uint8_t>();
+      auto dropout_state_dims = dropout_state_->dims();
+      state_size = dropout_state_dims[0];
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnRestoreDropoutDescriptor(
+              dropout_desc_, handle, dropout_prob_, dropout_state_data,
+              state_size, 0));
+    }
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
@@ -188,12 +173,12 @@ struct CudnnRNNCache {
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
+        CUDNN_RNN_ALGO_STANDARD, cudnn_type));
 #else
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
         rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_DATA_FLOAT));
+        cudnn_type));
 #endif
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -202,48 +187,42 @@ struct CudnnRNNCache {
         platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
-        handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
+        handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type));
+
+    PADDLE_ENFORCE_EQ(
+        weights_size_, cudnn_size * weight_numel,
+        platform::errors::InvalidArgument(
+            "The cudnn lstm and setting weight size should be same."));
 
-    PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
-                      "cudnn lstm weight size should be SAME");
     int dim_w[3];
-    dim_w[0] = weights_size_ / sizeof(float);
+    dim_w[0] = weights_size_ / cudnn_size;
     dim_w[1] = 1;
     dim_w[2] = 1;
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
-        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+        w_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
-        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+        dw_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
-        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
+        handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetRNNTrainingReserveSize(
-            handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
-
-    reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
-    reserve_data_.mutable_data<uint8_t>(place);
+            handle, rnn_desc_, seq_length_, x_desc_, reserve_size_));
 
     workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
     workspace_data_.mutable_data<uint8_t>(place);
   }
 
   void release() {
-    for (size_t i = 0; i < max_length_; ++i) {
+    for (size_t i = 0; i < seq_length_; ++i) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
     }
 
     delete[] x_desc_;
     delete[] y_desc_;
-    delete[] dx_desc_;
-    delete[] dy_desc_;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
index e336e25f0f457d600d96c2059762b66e985a65c7..ab3860ecafc3569c13b0b9e5c882df9ddc03e190 100644
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -36,25 +36,28 @@ class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
     int axis = context.Attr<int>("axis");
     bool exclusive = context.Attr<bool>("exclusive");
     bool reverse = context.Attr<bool>("reverse");
-    auto x_dims = X.dims();
-    if (axis == -1) {
-      axis = x_dims.size() - 1;
+    auto out_dims = Out.dims();
+
+    PADDLE_ENFORCE_EQ(
+        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
+        platform::errors::OutOfRange(
+            "Attr(axis) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+            out_dims.size(), out_dims.size() - 1, axis));
+    if (axis < 0) {
+      axis += out_dims.size();
     }
-    PADDLE_ENFORCE_LT(
-        axis, x_dims.size(),
-        platform::errors::InvalidArgument("axis(%d) should be less than the "
-                                          "dimension(%d) of the input tensor.",
-                                          axis, x_dims.size()));
+
     Out.template mutable_data<T>(context.GetPlace());
 
     int pre = 1;
     int post = 1;
-    int mid = x_dims[axis];
+    int mid = out_dims[axis];
     for (int i = 0; i < axis; ++i) {
-      pre *= x_dims[i];
+      pre *= out_dims[i];
     }
-    for (int i = axis + 1; i < x_dims.size(); ++i) {
-      post *= x_dims[i];
+    for (int i = axis + 1; i < out_dims.size(); ++i) {
+      post *= out_dims[i];
     }
 
     auto x = framework::EigenVector<T>::Flatten(X);
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 962d73d068985a3579cb698a5a545f2c0b0503dc..89ec1ddd12b9d8da8dba604ae4e759054212608e 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/cum_op.h"
 
 namespace paddle {
@@ -22,7 +23,14 @@ class CumOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<bool>("flatten")) {
+      ctx->SetOutputDim(
+          "Out",
+          framework::make_ddim({framework::product(ctx->GetInputDim("X"))}));
+    } else {
+      ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    }
+
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
@@ -35,8 +43,11 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("axis",
                  "The dimension to accumulate along. -1 means the last "
                  "dimension [default -1].")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
+        .SetDefault(-1);
+    AddAttr<bool>("flatten",
+                  "Whether to compute the cumsum over the flattened array. "
+                  "[default false].")
+        .SetDefault(false);
     AddAttr<bool>("exclusive",
                   "Whether to perform exclusive cumsum. [default false].")
         .SetDefault(false);
@@ -63,6 +74,8 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
     grad_op->SetInput("X", this->OutputGrad("Out"));
     grad_op->SetOutput("Out", this->InputGrad("X"));
     grad_op->SetAttr("axis", BOOST_GET_CONST(int, this->GetAttr("axis")));
+    grad_op->SetAttr("flatten",
+                     BOOST_GET_CONST(bool, this->GetAttr("flatten")));
     grad_op->SetAttr("reverse",
                      !BOOST_GET_CONST(bool, this->GetAttr("reverse")));
     grad_op->SetAttr("exclusive",
@@ -83,3 +96,14 @@ REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<int>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<int64_t>>);
+
+REGISTER_OP_VERSION(cumsum)
+    .AddCheckpoint(
+        R"ROC(
+      Upgrade cumsum add a new attribute [flatten].
+    )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "flatten",
+            "In order to compute the cumsum over the flattened array when the "
+            "argument `axis` in python API is None.",
+            false));
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
index 7ca5ba3289b26f9b01774b1ab0e85f075c4cfc90..cff0a101e03d54bffd172177726340b3d75b9fe9 100644
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
 #include "paddle/fluid/operators/cum_op.h"
 #include "paddle/fluid/platform/gpu_launch_param_config.h"
 
@@ -251,34 +255,62 @@ class CumCUDAKernel : public framework::OpKernel<T> {
     int axis = context.Attr<int>("axis");
     bool exclusive = context.Attr<bool>("exclusive");
     bool reverse = context.Attr<bool>("reverse");
-    auto in_dims = in->dims();
+    auto out_dims = out->dims();
     auto size = in->numel();
 
-    if (axis == -1) {
-      axis = in_dims.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
+        platform::errors::OutOfRange(
+            "Attr(axis) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+            out_dims.size(), out_dims.size() - 1, axis));
+    if (axis < 0) {
+      axis += out_dims.size();
     }
-    PADDLE_ENFORCE_LT(
-        axis, in_dims.size(),
-        platform::errors::InvalidArgument("axis(%d) should be less than the "
-                                          "dimension(%d) of the input tensor.",
-                                          axis, in_dims.size()));
-
-    int scan_dim_size = in_dims[axis];
-    bool optimize_condition = (axis == (in_dims.size() - 1)) ? true : false;
+
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* in_data = in->data<T>();
+
+    // Use thrust for parallel acceleration when the input size is equal to the
+    // length of the ‘axis’ dimension.
+    if (size == out_dims[axis]) {
+      if (reverse) {
+        thrust::device_ptr<const T> dev_ptr =
+            thrust::device_pointer_cast(in_data);
+        thrust::device_vector<T> vec(dev_ptr, dev_ptr + size);
+        if (exclusive) {
+          thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
+                                 out_data);
+        } else {
+          thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
+                                 out_data);
+        }
+        thrust::reverse(thrust::device, out_data, out_data + size);
+      } else {
+        if (exclusive) {
+          thrust::exclusive_scan(thrust::device, in_data, in_data + size,
+                                 out_data);
+        } else {
+          thrust::inclusive_scan(thrust::device, in_data, in_data + size,
+                                 out_data);
+        }
+      }
+      return;
+    }
+
+    const int& scan_dim_size = out_dims[axis];
+    bool optimize_condition = (axis == (out_dims.size() - 1)) ? true : false;
     int outer_dim_size = 1;
     int inner_dim_size = 1;
     // treat all dim index < axis as outer_dim_size
     for (size_t i = 0; i < axis; i++) {
-      outer_dim_size *= in_dims[i];
+      outer_dim_size *= out_dims[i];
     }
     // treat all dim index > axis as innner_dim_size
-    for (size_t i = axis + 1; i < in_dims.size(); i++) {
-      inner_dim_size *= in_dims[i];
+    for (size_t i = axis + 1; i < out_dims.size(); i++) {
+      inner_dim_size *= out_dims[i];
     }
 
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* in_data = in->data<T>();
-
     auto& dev_ctx = context.template device_context<DeviceContext>();
     if (optimize_condition) {
       auto nextPowerOfTwo = [](int x) -> int {
diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h
index c6140483ff5cb8108895546b6a01f058708231fd..956fd5ad3035434fbf3093786319b3f7ab7e7354 100644
--- a/paddle/fluid/operators/cvm_op.h
+++ b/paddle/fluid/operators/cvm_op.h
@@ -68,8 +68,19 @@ class CVMOpKernel : public framework::OpKernel<T> {
 
     // for Input X do not have Lod Information.
     if (x->NumLevels() == 0) {
-      for (int i = 0; i < batch_size; i++) {
-        CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
+      if (use_cvm) {
+        for (int i = 0; i < batch_size; i++) {
+          int cursor = i * item_size;
+          y_data[cursor] = log(x_data[cursor] + 1);
+          y_data[cursor + 1] = log(x_data[cursor + 1] + 1) - y_data[cursor];
+          for (int j = 2; j < item_size; j++) {
+            y_data[cursor + j] = x_data[cursor + j];
+          }
+        }
+      } else {
+        for (int i = 0; i < batch_size; i++) {
+          CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
+        }
       }
     } else {
       auto lod = x->lod()[0];
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index 16e1699e12c832d54af14f673577dcc32b015d6d..5cd853758926e622d0f87e6f8bbaba2cf3b9f85e 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -222,10 +222,12 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     } else {
       auto lod = dist_mat->lod().back();
       for (size_t i = 0; i < lod.size() - 1; ++i) {
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
-        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
-        if (type == "per_prediction") {
-          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+        if (lod[i + 1] > lod[i]) {
+          Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+          BipartiteMatch(one_ins, indices + i * col, dist + i * col);
+          if (type == "per_prediction") {
+            ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67dc2843345682b2dfe3d568e452461860575544
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.cc
@@ -0,0 +1,140 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/diag_v2_op.h"
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class DiagV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto offset = ctx->Attrs().Get<int>("offset");
+
+    if (x_dims.size() == 1UL) {
+      int64_t size = x_dims[0] + std::abs(offset);
+      ctx->SetOutputDim("Out", {size, size});
+    } else if (x_dims.size() == 2UL) {
+      int64_t size;
+      if (offset >= 0) {
+        size = std::min(x_dims[0], x_dims[1] - offset);
+      } else {
+        size = std::min(x_dims[0] + offset, x_dims[1]);
+      }
+      ctx->SetOutputDim("Out", {size});
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The input tensor X's dimensions of DiagV2Op should be either 1 or "
+          "2, but received %d.",
+          x_dims.size()));
+    }
+  }
+};
+
+class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor. Its shape is either 1-D or 2-D.");
+    AddOutput("Out", "The output tensor. A square matrix or a vector.");
+    AddAttr<int>("offset",
+                 "The diagonal offset. A positive value represents "
+                 "superdiagonal, 0 represents the main diagonal, and a "
+                 "negative value represents subdiagonal.")
+        .SetDefault(0);
+    AddAttr<float>("padding_value",
+                   "Use this value to fill the area outside the specified "
+                   "diagonal band. Only takes effect when the input is a 1-D "
+                   "Tensor. The default value is 0.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+      If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned.
+
+      If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned.
+
+      The argument ``offset`` controls the diagonal offset:
+
+      If ``offset`` = 0, it is the main diagonal.
+
+      If ``offset`` > 0, it is superdiagonal.
+
+      If ``offset`` < 0, it is subdiagonal.
+)DOC");
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DiagV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* x_data = X->data<T>();
+    auto x_dims = X->dims();
+    int offset = context.Attr<int>("offset");
+    auto* out = context.Output<framework::Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+
+    int64_t i;
+    if (x_dims.size() == 1) {
+      float padding_value = context.Attr<float>("padding_value");
+      math::SetConstant<DeviceContext, T> set_padding_value;
+      auto& dev_ctx = context.template device_context<DeviceContext>();
+      set_padding_value(dev_ctx, out, static_cast<T>(padding_value));
+
+      auto x_length = x_dims[0];
+      const int& x_stride = ComputeStride(0, x_dims);
+
+      auto out_stride_0 = ComputeStride(0, out_dims);
+      auto out_stride_1 = ComputeStride(1, out_dims);
+      out_data +=
+          (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
+
+      for (i = 0; i < x_length; i++) {
+        out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride];
+      }
+    } else {
+      auto out_length = out_dims[0];
+      const int& x_stride_0 = ComputeStride(0, x_dims);
+      const int& x_stride_1 = ComputeStride(1, x_dims);
+
+      auto out_stride_0 = ComputeStride(0, out_dims);
+      x_data += (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
+      for (i = 0; i < out_length; i++) {
+        out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)];
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    diag_v2, ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4386cc6b8183c03b4d4a19aba7d1126eac2ab495
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.cu
@@ -0,0 +1,122 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/diag_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Extract the diagonal of a matrix 'x' to a vector 'out'.
+template <typename T>
+__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start,
+                                      std::ptrdiff_t size,
+                                      const std::ptrdiff_t sumStride,
+                                      const std::ptrdiff_t outStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
+       idx += gridDim.x * blockDim.x) {
+    const std::ptrdiff_t xOffset = start + sumStride * idx;
+    out[outStride * idx] = x[xOffset];
+  }
+}
+
+// Paste a vector 'x' to the diagonal of a matrix 'out'
+template <typename T>
+__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start,
+                                    std::ptrdiff_t x_length,
+                                    const std::ptrdiff_t sumStride,
+                                    const std::ptrdiff_t xStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < x_length; idx += gridDim.x * blockDim.x) {
+    const std::ptrdiff_t outOffset = start + sumStride * idx;
+    out[outOffset] = x[xStride * idx];
+  }
+}
+
+template <typename DeviceContext, typename T>
+class DiagV2CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* x_data = X->data<T>();
+    auto x_dims = X->dims();
+    int offset = context.Attr<int>("offset");
+    auto* out = context.Output<framework::Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    if (x_dims.size() == 1) {
+      float padding_value = context.Attr<float>("padding_value");
+      math::SetConstant<DeviceContext, T> set_padding_value;
+      set_padding_value(dev_ctx, out, static_cast<T>(padding_value));
+
+      auto x_length = x_dims[0];
+      auto size = (offset > 0) ? x_length + offset : x_length - offset;
+      const int& x_stride = ComputeStride(0, x_dims);
+      if (size > 0) {
+        const int block_num = std::min(static_cast<int>(size),
+                                       dev_ctx.GetMaxPhysicalThreadCount());
+        int size_ = static_cast<int>(size);
+        int block_num_ = static_cast<int>(block_num);
+        const int grid_num =
+            std::min(1024, (size_ + block_num_ - 1) / block_num_);
+        const auto& out_stride_0 = ComputeStride(0, out_dims);
+        const auto& out_stride_1 = ComputeStride(1, out_dims);
+        auto start =
+            (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
+
+        PasteDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
+            out_data, x_data, start, x_length, out_stride_0 + out_stride_1,
+            x_stride);
+      }
+    } else {
+      const int& x_stride_0 = ComputeStride(0, x_dims);
+      const int& x_stride_1 = ComputeStride(1, x_dims);
+
+      int size;
+      if (offset > 0) {
+        size = std::min(x_dims[0], x_dims[1] - offset);
+      } else {
+        size = std::min(x_dims[0] + offset, x_dims[1]);
+      }
+
+      if (size > 0) {
+        const int block_num = std::min(static_cast<int>(size),
+                                       dev_ctx.GetMaxPhysicalThreadCount());
+        int size_ = static_cast<int>(size);
+        int block_num_ = static_cast<int>(block_num);
+        const int grid_num =
+            std::min(1024, (size_ + block_num_ - 1) / block_num_);
+        auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
+        const auto& out_stride_0 = ComputeStride(0, out_dims);
+
+        ExtractDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
+            out_data, x_data, start, size, x_stride_0 + x_stride_1,
+            out_stride_0);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    diag_v2, ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7850def06117ff4232afe4fca95a3e3e500e876d
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using DDim = framework::DDim;
+
+static inline int ComputeStride(int axis, DDim dims) {
+  int size = 1;
+  for (int i = axis + 1; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index cff3993a068ceee1947ca3e17b9cc6a75e3c9ba9..a033611f478f9ea44fd49ab2015e78aaea6aacd9 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
 cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
-cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv)
+cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
 cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
 if(WITH_GPU)
     cc_test(collective_server_test SRCS collective_server_test.cc 
diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h
index fb7a0691154de768d4b828ee5d7b6a47755225f4..0d7032e286caab93dbd38f35881e9064694a8307 100644
--- a/paddle/fluid/operators/distributed/large_scale_kv.h
+++ b/paddle/fluid/operators/distributed/large_scale_kv.h
@@ -28,6 +28,7 @@
 #include <thread>  // NOLINT
 
 #include <ThreadPool.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -96,7 +97,12 @@ class UniformInitializer : public Initializer {
     dist_ = std::uniform_real_distribution<float>(min_, max_);
   }
 
-  float GetValue() override { return dist_(random_engine_); }
+  float GetValue() override {
+    return framework::Generator::GetInstance()->is_init_py
+               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
+               : dist_(random_engine_);
+    // return dist_(random_engine_);
+  }
 
  private:
   float min_;
@@ -141,7 +147,12 @@ class GaussianInitializer : public Initializer {
     dist_ = std::normal_distribution<float>(mean_, std_);
   }
 
-  float GetValue() override { return dist_(random_engine_); }
+  float GetValue() override {
+    return framework::Generator::GetInstance()->is_init_py
+               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
+               : dist_(random_engine_);
+    // return dist_(random_engine_);
+  }
 
  private:
   float std_;
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 5a67b358ddabb12566cd4ffe00cb12c65a185099..a9378d61c3ca39bd43b558633cc4d04c40175cac 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -110,7 +110,7 @@ void prefetch_core(
   int pservers = context.Attr<int>("pserver_num");
 
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &actual_ctx = *pool.Get(context.GetPlace());
+  auto &actual_ctx = *pool.Get(platform::CPUPlace());
 
   std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
 
@@ -144,7 +144,6 @@ void prefetch_core(
       VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
     }
   }
-
   for (size_t i = 0; i < rets.size(); i++) {
     PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout(
                                                "internal error in RPCClient"));
@@ -167,6 +166,7 @@ void prefetch_core(
       for (int64_t i = 0; i < dims[0]; ++i) {
         auto origin_id = ids_in_this_section[i];
         std::vector<float> vecs(row_numel);
+
         std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin());
         (*recved_vec_map)[origin_id] = vecs;
       }
@@ -213,18 +213,18 @@ void prefetchs(const std::vector<std::string> &id_var_names,
   const auto place =
       scope.FindVar(id_var_names[0])->Get<framework::LoDTensor>().place();
 
-  if (!platform::is_cpu_place(place)) {
-    PADDLE_THROW("multi prefetch only support CPU currently");
-  }
-
+  std::vector<std::vector<int64_t>> ids_group;
   std::vector<int64_t> ids_union;
+  std::vector<framework::LoD> ids_lods;
   TableAndEndpoints tables;
 
   for (auto &id_name : id_var_names) {
-    auto *in_var = scope.FindVar(id_name);
-    auto &id_tensor = in_var->Get<framework::LoDTensor>();
-    std::copy_n(id_tensor.data<int64_t>(), id_tensor.numel(),
-                back_inserter(ids_union));
+    auto &id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+    std::vector<int64_t> ids;
+    TensorToVector(id_tensor, context.device_context(), &ids);
+    ids_union.insert(ids_union.end(), ids.begin(), ids.end());
+    ids_group.push_back(ids);
+    ids_lods.push_back(id_tensor.lod());
   }
 
   std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
@@ -258,25 +258,48 @@ void prefetchs(const std::vector<std::string> &id_var_names,
   }
 
   for (size_t i = 0; i < out_var_names.size(); i++) {
-    auto *in_var = scope.FindVar(id_var_names[i]);
-    auto &id_tensor = in_var->Get<framework::LoDTensor>();
-    auto ids_size = id_tensor.dims()[0];
-    const auto *id_data = id_tensor.data<int64_t>();
-
+    std::vector<int64_t> ids = ids_group[i];
+    auto ids_size = ids.size();
     auto *out_t =
         scope.FindVar(out_var_names[i])->GetMutable<framework::LoDTensor>();
-    out_t->set_lod(id_tensor.lod());
-    out_t->Resize(framework::make_ddim({ids_size, vec_dim_1}));
+    out_t->set_lod(ids_lods[i]);
+    out_t->Resize(
+        framework::make_ddim({static_cast<int64_t>(ids_size), vec_dim_1}));
     auto *out_d = out_t->mutable_data<float>(place);
 
-    for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
-      const auto &id = id_data[idx];
-      if (padding_idx != distributed::kNoPadding && id == padding_idx) {
-        memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
-      } else {
-        std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
-                    out_d + idx * vec_dim_1);
+    if (platform::is_cpu_place(out_t->place())) {
+      for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
+        const auto &id = ids[idx];
+        if (padding_idx != distributed::kNoPadding && id == padding_idx) {
+          memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
+        } else {
+          std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
+                      out_d + idx * vec_dim_1);
+        }
+      }
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
+        const auto &id = ids[idx];
+        auto stream = context.cuda_device_context().stream();
+        if (padding_idx != distributed::kNoPadding && id == padding_idx) {
+          platform::GpuMemsetAsync(out_d + idx * vec_dim_1, 0,
+                                   sizeof(float) * vec_dim_1, stream);
+        } else {
+          auto &cpu_place =
+              BOOST_GET_CONST(platform::CPUPlace,
+                              paddle::platform::CPUDeviceContext().GetPlace());
+          auto &gpu_place =
+              BOOST_GET_CONST(platform::CUDAPlace, out_t->place());
+          memory::Copy(gpu_place, out_d + idx * vec_dim_1, cpu_place,
+                       &recved_vec_map[id][0], sizeof(float) * vec_dim_1,
+                       stream);
+        }
       }
+#else
+      PADDLE_ENFORCE(true, platform::errors::PermissionDenied(
+                               "Paddle is not compiled with GPU!"));
+#endif
     }
   }
 }
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
index 3037a63b0d7b4e8812e67fdfb776f89ea43eb546..8c093d12585981ee681ae13f0d2e493197c6b9b3 100644
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -75,47 +73,6 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename T>
-class DistributedLookupTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto ids_vars = context.MultiInputVar("Ids");
-    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
-
-    auto id_names = context.InputNames("Ids");
-    auto embedding_name = context.InputNames("W").front();
-    auto out_names = context.OutputNames("Outputs");
-    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
-    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
-    auto is_distributed = context.Attr<bool>("is_distributed");
-
-    auto lookup_table_version =
-        context.Attr<std::string>("lookup_table_version");
-
-    operators::distributed::prefetchs(id_names, out_names, embedding_name,
-                                      is_distributed, lookup_tables, endpoints,
-                                      context, context.scope());
-
-    if (lookup_table_version == "lookup_table_v2") {
-      auto &scope = context.scope();
-      auto emb_dim =
-          scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
-
-      for (size_t i = 0; i < id_names.size(); ++i) {
-        auto *id_var = scope.FindVar(id_names[i]);
-        auto *out_var = scope.FindVar(out_names[i]);
-        auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
-        auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
-
-        auto id_dims = id_tensor->dims();
-        out_tensor->Resize(framework::make_ddim(
-            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
-             static_cast<int64_t>(emb_dim)}));
-      }
-    }
-  }
-};
-
 class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -170,15 +127,12 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(
 Lookup Tablel Prefetch Operator.
-
 This operator is used to perform lookup on parameter W,
 then concatenated into a sparse tensor.
-
 The type of Ids(Input) is SelectedRows, the rows of Ids contains
 the ids to be looked up in W;
 if the Id is not in the sparse table, this operator will return a
 random value and set the value into the table for the next looking up.
-
 )DOC");
   }
 };
@@ -191,4 +145,5 @@ REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
                   ops::DistributedLookupTableOpMaker);
 
 REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
-                       ops::DistributedLookupTableKernel<float>);
+                       ops::DistributedLookupTableKernel<
+                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c894415096e869f363eda6a1de2a473e839263
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    distributed_lookup_table,
+    ops::DistributedLookupTableKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a71451c78a870b71c05b41bdcfb34a85b3e2213b
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DistributedLookupTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto ids_vars = context.MultiInputVar("Ids");
+    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
+
+    auto id_names = context.InputNames("Ids");
+    auto embedding_name = context.InputNames("W").front();
+    auto out_names = context.OutputNames("Outputs");
+    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
+    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
+    auto is_distributed = context.Attr<bool>("is_distributed");
+
+    operators::distributed::prefetchs(id_names, out_names, embedding_name,
+                                      is_distributed, lookup_tables, endpoints,
+                                      context, context.scope());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
index ccc30d1ea082a6f69b71059631247144c931116e..d194fcda36a474fa208f5d5a67e425ba5a5a3303 100644
--- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 2580b00d7c2bdfae726b924bb51de199586e12c3..cec706300d77b2b0e66e5b682dfad8536b5dc401 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -26,6 +26,86 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename DeviceContext, typename T>
+void DotGradFunction(const Tensor* tensor_x, const Tensor* tensor_y,
+                     const Tensor* tensor_dout, Tensor* tensor_dx,
+                     Tensor* tensor_dy,
+                     const paddle::framework::ExecutionContext& ctx) {
+#ifdef __NVCC__
+  if (1 == tensor_dout->dims().size()) {
+    auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
+
+    if (tensor_dx) {
+      auto y = framework::EigenVector<T>::Flatten(*tensor_y);
+      auto dx = framework::EigenVector<T>::Flatten(*tensor_dx);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 1> size(tensor_dx->numel());
+      dx.device(dev) = y * dout.broadcast(size);
+    }
+
+    if (tensor_dy) {
+      auto x = framework::EigenVector<T>::Flatten(*tensor_x);
+      auto dy = framework::EigenVector<T>::Flatten(*tensor_dy);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 1> size(tensor_dy->numel());
+      dy.device(dev) = x * dout.broadcast(size);
+    }
+  } else {
+    auto dout = EigenMatrix<T>::From(*tensor_dout);
+
+    if (tensor_dx) {
+      tensor_dx->mutable_data<T>(ctx.GetPlace());
+      auto y = EigenMatrix<T>::From(*tensor_y);
+      auto dx = EigenMatrix<T>::From(*tensor_dx);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
+      dx.device(dev) = y * dout.broadcast(size);
+    }
+
+    if (tensor_dy) {
+      tensor_dy->mutable_data<T>(ctx.GetPlace());
+      auto x = EigenMatrix<T>::From(*tensor_x);
+      auto dy = EigenMatrix<T>::From(*tensor_dy);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
+      dy.device(dev) = x * dout.broadcast(size);
+    }
+  }
+#else
+  const auto* data_dout = tensor_dout->data<T>();
+
+  if (tensor_dx) {
+    auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
+    const auto* data_y = tensor_y->data<T>();
+    const framework::DDim& dim = tensor_x->dims();
+    size_t N = static_cast<size_t>(framework::product(dim));
+
+    auto step = dim[dim.size() - 1];
+
+    int s = -1;
+    for (size_t i = 0; i < N; ++i) {
+      if (0 == i % step) ++s;
+      data_dx[i] = data_y[i] * data_dout[s];
+    }
+  }
+
+  if (tensor_dy) {
+    auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
+    const auto* data_x = tensor_x->data<T>();
+    const framework::DDim& dim = tensor_y->dims();
+    size_t N = static_cast<size_t>(framework::product(dim));
+
+    auto step = dim[dim.size() - 1];
+
+    int s = -1;
+    for (size_t i = 0; i < N; ++i) {
+      if (0 == i % step) ++s;
+      data_dy[i] = data_x[i] * data_dout[s];
+    }
+  }
+#endif
+}
+
 template <typename DeviceContext, typename T>
 class DotKernel : public framework::OpKernel<T> {
  public:
@@ -84,83 +164,9 @@ class DotGradKernel : public framework::OpKernel<T> {
 
     if (tensor_dx) tensor_dx->mutable_data<T>(ctx.GetPlace());
     if (tensor_dy) tensor_dy->mutable_data<T>(ctx.GetPlace());
-#ifdef __NVCC__
-    if (1 == tensor_dout->dims().size()) {
-      auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
-
-      if (tensor_dx) {
-        auto y = framework::EigenVector<T>::Flatten(*tensor_y);
-        auto dx = framework::EigenVector<T>::Flatten(*tensor_dx);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 1> size(tensor_dx->numel());
-        dx.device(dev) = y * dout.broadcast(size);
-      }
-
-      if (tensor_dy) {
-        auto x = framework::EigenVector<T>::Flatten(*tensor_x);
-        auto dy = framework::EigenVector<T>::Flatten(*tensor_dy);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 1> size(tensor_dy->numel());
-        dy.device(dev) = x * dout.broadcast(size);
-      }
-    } else {
-      auto dout = EigenMatrix<T>::From(*tensor_dout);
-
-      if (tensor_dx) {
-        tensor_dx->mutable_data<T>(ctx.GetPlace());
-        auto y = EigenMatrix<T>::From(*tensor_y);
-        auto dx = EigenMatrix<T>::From(*tensor_dx);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
-        dx.device(dev) = y * dout.broadcast(size);
-      }
-
-      if (tensor_dy) {
-        tensor_dy->mutable_data<T>(ctx.GetPlace());
-        auto x = EigenMatrix<T>::From(*tensor_x);
-        auto dy = EigenMatrix<T>::From(*tensor_dy);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
-        dy.device(dev) = x * dout.broadcast(size);
-      }
-    }
-#else
-    const auto* data_dout = tensor_dout->data<T>();
-
-    if (tensor_dx) {
-      auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
-      const auto* data_y = tensor_y->data<T>();
-      const framework::DDim& dim = tensor_x->dims();
-      size_t N = static_cast<size_t>(framework::product(dim));
-
-      auto step = dim[dim.size() - 1];
 
-      int s = -1;
-      for (size_t i = 0; i < N; ++i) {
-        if (0 == i % step) ++s;
-        data_dx[i] = data_y[i] * data_dout[s];
-      }
-    }
-
-    if (tensor_dy) {
-      auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
-      const auto* data_x = tensor_x->data<T>();
-      const framework::DDim& dim = tensor_y->dims();
-      size_t N = static_cast<size_t>(framework::product(dim));
-
-      auto step = dim[dim.size() - 1];
-
-      int s = -1;
-      for (size_t i = 0; i < N; ++i) {
-        if (0 == i % step) ++s;
-        data_dy[i] = data_x[i] * data_dout[s];
-      }
-    }
-#endif
+    DotGradFunction<DeviceContext, T>(tensor_x, tensor_y, tensor_dout,
+                                      tensor_dx, tensor_dy, ctx);
   }
 };
 
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 676361289e888a5bbd71b63ff16a1ddd7e5dad51..bce4c7ca19a603fd2eadaff7f82b5cdec91bb79f 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -55,6 +56,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
         return;
       }
 
+      bool init_generator_py = framework::Generator::GetInstance()->is_init_py;
+
       // NOTE: fixed seed should only be used in unittest or for debug.
       // Guarantee to use random seed in training.
       std::random_device rnd;
@@ -71,7 +74,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       std::uniform_real_distribution<float> dist(0, 1);
 
       for (size_t i = 0; i < size; ++i) {
-        if (dist(engine) < dropout_prob) {
+        float cur_random =
+            init_generator_py
+                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
+                : dist(engine);
+        if (cur_random < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
index 5a398fa50febe2efffd588ce8f3612f1f9cec0b6..457d9e79d7da171ef526d5cab0e59b021cb64f98 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -49,6 +49,8 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
                                    int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
index 60846d1e8fee1c7f68ac101f18355750c2c15a4d..f63d6f037632c1a6a05726b933b2258adc113ee3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
@@ -19,5 +19,7 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index 2d24e394d5c823dbd22c837210e46cefeceba1be..8afe2133c0488bbe04ec4803aac5dce6573f634d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -24,7 +25,36 @@ namespace operators {
 
 template <typename T>
 struct FloorDivFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+#ifdef __CUDA_ARCH__
+    if (b == 0) {
+      printf("Error: Divide by zero encounter in floor_divide\n");
+      asm("trap;");
+    }
+#else
+    if (b == 0)
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Divide by zero encounter in floor_divide"));
+#endif
+    return static_cast<T>(std::trunc(a / b));
+  }
+};
+
+template <typename T>
+struct InverseFloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+#ifdef __CUDA_ARCH__
+    if (a == 0) {
+      printf("Error: Divide by zero encounter in floor_divide\n");
+      asm("trap;");
+    }
+#else
+    if (a == 0)
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Divide by zero encounter in floor_divide"));
+#endif
+    return static_cast<T>(std::trunc(b / a));
+  }
 };
 
 template <typename DeviceContext, typename T>
@@ -32,8 +62,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                            const framework::Tensor *x,
                            const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
index af80666b9542db1073f3b07618433671652fffa4..8c2e62bed195f27e228d5dd460ba21ed87c3f5d2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -25,14 +25,14 @@ class ElementwiseModOpMaker : public ElementwiseOpMaker {
 
   void AddInputX() override {
     AddInput("X",
-             "(Variable), Tensor or LoDTensor of any dimensions. Its dtype "
-             "should be int32, int64.");
+             "(Tensor), Tensor of any dimensions. Its dtype "
+             "should be int32, int64, float32 or float64.");
   }
 
   void AddInputY() override {
     AddInput("Y",
-             "(Variable), Tensor or LoDTensor of any dimensions. Its dtype "
-             "should be int32, int64.");
+             "(Tensor), Tensor of any dimensions. Its dtype "
+             "should be int32, int64, float32 or float64.");
   }
 
   std::string GetOpFuntionality() const override {
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
index 4306a471b76c5bd4f0a5284052d7d39aa5fbc279..47bd6af0b95ace2b9b753e38cfc5f191bc1bb942 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -24,13 +24,19 @@ namespace operators {
 
 template <typename T>
 struct ModFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = a % b;
+    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
+    return res;
+  }
 };
 
 template <typename T>
 struct ModFunctorFP {
   inline HOSTDEVICE T operator()(T a, T b) const {
-    return fmod(b + fmod(a, b), b);
+    T res = fmod(a, b);
+    if ((res != 0) && ((b < 0) != (res < 0))) res += b;
+    return res;
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index de202ecf88cacbb5877f8aa226409b65e819d3c6..ece6af1b5a6f562bd7ff81290f98e8636feabb0c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -19,9 +19,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -82,7 +84,13 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       auto y_dims = ctx->GetInputDim("Y");
       int max_dim = std::max(x_dims.size(), y_dims.size());
       int axis = ctx->Attrs().Get<int>("axis");
-      axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+      PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true,
+                        platform::errors::InvalidArgument(
+                            "The axis range must be [%s, %s), but axis is %s. "
+                            "Please set the axis again.",
+                            -1 * max_dim, max_dim, axis));
+      axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
+                       : axis);
       std::vector<int> x_dims_array(max_dim);
       std::vector<int> y_dims_array(max_dim);
       std::vector<int> out_dims_array(max_dim);
@@ -132,8 +140,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                  "Y.dimension must be a subsequence of x.dimension. And axis "
                  "is the start dimension index "
                  "for broadcasting Y onto X. ")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
+        .SetDefault(-1);
     AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
         .SetDefault(false);
     AddAttr<std::string>("x_data_format", "This parameter is no longer used.")
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
old mode 100644
new mode 100755
index ff55d2f2040a17c32720df08c1ac0b00cc1d7a02..a910c326196bc61758c3be7db3b8ac5d85b0095c
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
@@ -22,15 +22,20 @@ namespace operators {
 template <typename T>
 struct PowFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-#ifdef __CUDA_ARCH__
-    // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
-    // it will return a float number like 2.99... , which floor to 2
-    // when cast to int by default and it is wrong.
-    // Use llrint to cast it to the nearest integer, which is 3.
+    // TODO(wujionghao): A potential speed improvement is supporting different
+    // types in C++.
+    // #ifdef __CUDA_ARCH__
+    //     // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+    //     // it will return a float number like 2.99... , which floor to 2
+    //     // when cast to int by default and it is wrong.
+    //     // Use llrint to cast it to the nearest integer, which is 3.
+    //     if (std::is_integral<T>::value) {
+    //       return std::llrint(std::pow(a, b));
+    //     }
+    // #endif
     if (std::is_integral<T>::value) {
       return std::llrint(std::pow(a, b));
     }
-#endif
     return std::pow(a, b);
   }
 };
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..495b640bb4399736456bf391c6522686b9763951
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -0,0 +1,150 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/expand_as_v2_op.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandAsV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2");
+    OP_INOUT_CHECK(ctx->HasInput("target_tensor"), "Input", "target_tensor",
+                   "ExpandAsV2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2");
+    auto x_dims = ctx->GetInputDim("X");
+    auto target_tensor_dims = ctx->GetInputDim("target_tensor");
+    PADDLE_ENFORCE_GE(
+        target_tensor_dims.size(), static_cast<size_t>(x_dims.size()),
+        platform::errors::InvalidArgument(
+            "The rank of Input(target_tensor) must be greater than or equal "
+            "to the rank of Input(X). But received Input(X): input "
+            "rank %u, input shape [%s]; received Input(target_tensor): "
+            "input rank %u, input shape [%s].",
+            x_dims.size(), x_dims, target_tensor_dims.size(),
+            target_tensor_dims));
+    PADDLE_ENFORCE_LE(
+        target_tensor_dims.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of Input(target_tensor) must not be less than or equal "
+            "to %d. But received: input rank %u, input shape [%s].",
+            MAX_RANK_SUPPORTED, x_dims.size(), x_dims));
+    std::vector<int64_t> out_shape(target_tensor_dims.size());
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+  }
+};
+
+class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
+    AddInput("target_tensor", "Expand tensor's shape for each dimension.");
+    AddComment(R"DOC(
+Expand the input by given times number. You should set times
+number for each dimension by providing tensor 'expend_tensor'. The rank of X
+should be in [1, 6]. Please note that size of 'expend_tensor' must be the same
+with X's rank. Following is a using case:
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+target_tensors'shape:  [2, 6, 2]
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+)DOC");
+  }
+};
+
+class ExpandAsV2GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "ExpandAsV2Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class ExpandAsV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("expand_as_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("target_tensor", this->Input("target_tensor"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker,
+                  ops::ExpandAsV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
+                  ops::ExpandAsV2GradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    expand_as_v2,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    expand_as_v2_grad,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_as_v2_op.cu b/paddle/fluid/operators/expand_as_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e315144472dd9fd4095043e4800a3f276d9314c7
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/expand_as_v2_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    expand_as_v2,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_as_v2_grad,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4c30dfe1298d1736407c33f40d72dc690046cba
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.h
@@ -0,0 +1,214 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_AS_TEMPLATE(z, n, data) \
+  case n + 1: {                        \
+    ExpandAs<n + 1>(context);          \
+    break;                             \
+  }
+#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_AS_GRAD_CASE(n)                                       \
+  case n: {                                                          \
+    ExpandAsBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                           \
+  }
+#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), )
+#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \
+  BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ExpandAsV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto target_rank = target_tensor->dims().size();
+    PADDLE_ENFORCE_GE(target_rank, rank,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be greater than or equal to "
+                          "the rank (%d) of the input 'x'.",
+                          target_rank, rank));
+    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
+                                   "The rank (%d) of the input 'x' for "
+                                   "expand_as_v2 op must be positive.",
+                                   rank));
+    PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be less than or equal to %d.",
+                          target_rank, MAX_RANK_SUPPORTED));
+
+    switch (target_rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void ExpandAs(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto in_dims = in0->dims();
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    auto target_shape = framework::vectorize<int>(target_tensor->dims());
+    auto diff = target_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(target_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The value of target shape cannot be zero."));
+      if (vec_in_dims[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            vec_in_dims[i], target_shape[i],
+            platform::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in "
+                "target tensor for expand_as_v2 op.",
+                vec_in_dims[i], target_shape[i]));
+        repeat_times[i] = 1;
+      } else {
+        repeat_times[i] = target_shape[i];
+      }
+    }
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims = framework::make_ddim(target_shape);
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ExpandAsV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto x_dims = in0->dims();
+    auto target_shape = target_tensor->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    auto diff = target_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      repeat_times[i] = target_shape[i] / vec_in_dims[i];
+    }
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_as_v2_grad op must be greater than or "
+                            "equal to 1, but the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_as_v2_grad op must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandAsBackward(const framework::ExecutionContext& context,
+                        const std::vector<int>& reshape_dims_vec,
+                        const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..359d512c341529579a56dbe840e5eef0aa3062a5
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/expand_v2_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandV2");
+    auto x_dims = ctx->GetInputDim("X");
+    auto expand_shape = ctx->Attrs().Get<std::vector<int>>("shape");
+
+    if (expand_shape.size() == 0) {
+      expand_shape = std::vector<int>(x_dims.size(), -1);
+    }
+
+    PADDLE_ENFORCE_GE(
+        expand_shape.size(), static_cast<size_t>(x_dims.size()),
+        platform::errors::InvalidArgument(
+            "The number of elements (%d) of 'shape' for "
+            "expand_v2 op must be greater than or equal to the rank "
+            "(%d) of the input.",
+            expand_shape.size(), static_cast<size_t>(x_dims.size())));
+    PADDLE_ENFORCE_LE(expand_shape.size(), MAX_RANK_SUPPORTED,
+                      platform::errors::InvalidArgument(
+                          "The number of elements (%d) of 'shape' for "
+                          "must not be greater than %d.",
+                          expand_shape.size(), MAX_RANK_SUPPORTED));
+    PADDLE_ENFORCE_GE(expand_shape.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The number of elements (%d) of 'shape' for "
+                          "must be a positive integer.",
+                          expand_shape.size()));
+
+    auto out_rank =
+        std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
+    std::vector<int64_t> out_shape(out_rank);
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - x_dim_vec.size();
+    x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    for (size_t i = 0; i < expand_shape.size(); ++i) {
+      if (x_dims[i] == -1) {
+        out_shape[i] = -1;
+      } else if (expand_shape[i] == -1) {
+        out_shape[i] = x_dims[i];
+      } else {
+        PADDLE_ENFORCE_GT(
+            expand_shape[i], 0,
+            platform::errors::InvalidArgument(
+                "The %uth element of 'shape' for expand_v2 op must be "
+                "greater than 0, but the value given is %d.",
+                i, expand_shape[i]));
+        out_shape[i] = expand_shape[i];
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "expand_shapes_tensor" || var_name == "Shape") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class ExpandV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
+    AddInput("Shape",
+             "(Tensor<int>), optional). If provided, expand according to "
+             "this given Shape. It has a higher priority than "
+             "expand_shapes_tensor and the shape attribute.")
+        .AsDispensable();
+    AddInput("expand_shapes_tensor",
+             "(Tensor Tensor<int>), epxanded shape for X."
+             "It has a higher priority than shape attribute, but a lower "
+             "priority than the input Shape")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
+    AddAttr<std::vector<int>>("shape", "The expanded shape for each dimension.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Expand the input to the given shape. The rank of X
+should be in [1, 6] and size of 'shape' must be in [1, 6] also.
+Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(shape):  [2, 6, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandV2GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "ExpandV2Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    if (expand_shape.size() == 0) {
+      expand_shape = std::vector<int>(x_dims.size(), -1);
+    }
+
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - x_dim_vec.size();
+    x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+
+    for (size_t i = 0; i < expand_shape.size(); ++i) {
+      if (expand_shape[i] == -1 || x_dim_vec[i] == -1) {
+        continue;
+      } else {
+        if (ctx->IsRuntime()) {
+          PADDLE_ENFORCE_EQ(
+              expand_shape[i], out_dims[i],
+              platform::errors::InvalidArgument(
+                  "The size (%d) of the dimension %d of Input(Out@GRAD) should "
+                  "be equal to the crroresponding dimension size of shape(%d).",
+                  out_dims[i], i, expand_shape[i]));
+        }
+      }
+    }
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "expand_shapes_tensor" || var_name == "Shape") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("expand_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
+    op->SetInput("Shape", this->Input("Shape"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
+                  ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
+                  ops::ExpandV2GradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    expand_v2_grad,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/expand_v2_op.cu b/paddle/fluid/operators/expand_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e096dbc27f0c2ae8142da40b9db99074b2719387
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/expand_v2_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    expand_v2, ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_v2_grad,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec9c6e62f272ed87abc4e0be6ccf1de3aedf15d4
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.h
@@ -0,0 +1,296 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+inline std::vector<int> get_expand_shape(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("Shape")) {
+    auto* shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
+    auto* shape_data = shape_tensor->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    if (platform::is_gpu_place(shape_tensor->place())) {
+      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+      shape_data = cpu_shape_tensor.data<int>();
+    }
+    auto vec_shape =
+        std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+    return vec_shape;
+  }
+
+  auto list_expand_shapes_tensor =
+      ctx.MultiInput<framework::Tensor>("expand_shapes_tensor");
+  if (list_expand_shapes_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_epxand_shape;
+    for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) {
+      auto tensor = list_expand_shapes_tensor[i];
+      if (platform::is_gpu_place(tensor->place())) {
+        framework::Tensor temp;
+        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_epxand_shape.push_back(*temp.data<int32_t>());
+      } else {
+        vec_epxand_shape.push_back(*tensor->data<int32_t>());
+      }
+    }
+    return vec_epxand_shape;
+  } else {
+    return ctx.Attr<std::vector<int>>("shape");
+  }
+}
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::To32BitIndex;
+
+template <typename DeviceContext, typename T>
+class ExpandV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'X' for expand_v2 op must be positive, "
+            "but the value received is %d.",
+            rank));
+    PADDLE_ENFORCE_LE(
+        rank, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'X' for expand_v2 op must be less than "
+            "or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, rank));
+    auto expand_shape = get_expand_shape(context);
+    auto shape_size = expand_shape.size();
+    PADDLE_ENFORCE_GE(
+        shape_size, rank,
+        platform::errors::InvalidArgument(
+            "The number (%d) of elements of 'shape' for expand_v2 op must be "
+            "greater than or equal to the rank (%d) of the input 'X'.",
+            shape_size, rank));
+    PADDLE_ENFORCE_LE(
+        shape_size, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The number (%d) of elements of 'shape' for expand_v2 op must be "
+            "less than or equal to %d.",
+            shape_size, MAX_RANK_SUPPORTED));
+    rank = std::max(rank, static_cast<int>(shape_size));
+    switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+
+    auto in_dims = in0->dims();
+    auto expand_shape = get_expand_shape(context);
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    auto diff = expand_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(expand_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The expanded size cannot be zero."));
+      if (i < diff) {
+        PADDLE_ENFORCE_GT(
+            expand_shape[i], 0,
+            platform::errors::InvalidArgument(
+                "The expanded size (%d) for non-existing dimensions must be "
+                "positive for expand_v2 op.",
+                expand_shape[i]));
+        repeat_times[i] = expand_shape[i];
+      } else if (expand_shape[i] > 0) {
+        if (vec_in_dims[i] != 1) {
+          PADDLE_ENFORCE_EQ(
+              vec_in_dims[i], expand_shape[i],
+              platform::errors::InvalidArgument(
+                  "The value (%d) of the non-singleton dimension does not match"
+                  " the corresponding value (%d) in shape for expand_v2 op.",
+                  vec_in_dims[i], expand_shape[i]));
+          repeat_times[i] = 1;
+        } else {
+          repeat_times[i] = expand_shape[i];
+        }
+      } else {
+        PADDLE_ENFORCE_EQ(
+            expand_shape[i], -1,
+            platform::errors::InvalidArgument(
+                "When the value in shape is negative for expand_v2 op, "
+                "only -1 is supported, but the value received is %d.",
+                expand_shape[i]));
+        repeat_times[i] = 1;
+      }
+    }
+
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims(new_in_dims);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      out_dims[i] *= repeat_times[i];
+    }
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    // use 32-bit index to speed up
+    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
+    if (use_32bit_index) {
+      To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims);
+    } else {
+      y.device(place) = x.broadcast(bcast_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ExpandV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto expand_shape = get_expand_shape(context);
+    auto x_dims = in0->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    // 1. reshape_dims_vec is the broadcast parameter.
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      if (expand_shape[i] < 0) {
+        repeat_times[i] = 1;
+      } else {
+        repeat_times[i] = expand_shape[i] / vec_in_dims[i];
+      }
+    }
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_v2_grad op must be greater than or "
+                            "equal to 1, but the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_v2_grad op must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc
index 2cf08e5c3409acb6b8a43058e143df55fd563d74..793519b40182114c13e63dd32caaa382d55fa52d 100644
--- a/paddle/fluid/operators/eye_op.cc
+++ b/paddle/fluid/operators/eye_op.cc
@@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns].
 
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
-using float16 = paddle::platform::float16;
 
 REGISTER_OPERATOR(
     eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
@@ -93,4 +92,4 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
                        ops::EyeKernel<CPU, double>,
                        ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
-                       ops::EyeKernel<CPU, float16>);
+                       ops::EyeKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 0d2b951ee1c544151e99af8216db7809e2a77852..9b0328b0945ba9b57cb9ab27233656e3b0af4f5f 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -37,20 +37,49 @@ template <typename T>
 struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
+                  const int scale_num, T max_range, const int quant_axis,
+                  framework::Tensor* out) {
     if (scale_num == 1) {
-      const int channel = in->dims()[0];
+      // Dequant op is before quantized op
+      // Dequantize the weight of quantized op
+      auto in_dims = in->dims();
+      const int64_t channel = in_dims[quant_axis];
       const T* scale_factor = scales[0]->data<T>();
-      for (int i = 0; i < channel; i++) {
-        T s = scale_factor[i];
-        framework::Tensor one_channel_in = in->Slice(i, i + 1);
-        framework::Tensor one_channel_out = out->Slice(i, i + 1);
-        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
-        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-        auto& dev = *dev_ctx.eigen_device();
-        out_e.device(dev) = in_e * s / max_range;
+      if (quant_axis == 0) {
+        for (int64_t i = 0; i < channel; i++) {
+          T s = scale_factor[i];
+          framework::Tensor one_channel_in = in->Slice(i, i + 1);
+          framework::Tensor one_channel_out = out->Slice(i, i + 1);
+          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+          auto& dev = *dev_ctx.eigen_device();
+          out_e.device(dev) = in_e * s / max_range;
+        }
+      } else if (quant_axis == 1) {
+        int64_t out_iter = 1;
+        for (int i = 0; i < quant_axis; i++) {
+          out_iter *= in_dims[i];
+        }
+        int64_t step_i = in->numel() / out_iter;
+        int64_t step_j = in->numel() / (out_iter * channel);
+        auto* in_data = in->data<T>();
+        auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+        for (int64_t i = 0; i < out_iter; i++) {
+          for (int64_t j = 0; j < channel; j++) {
+            auto* cur_in = in_data + i * step_i + j * step_j;
+            auto* cur_out = out_data + i * step_i + j * step_j;
+            T s = scale_factor[j];
+            for (int64_t k = 0; k < step_j; k++) {
+              *cur_out = (*cur_in) * s / max_range;
+              ++cur_in;
+              ++cur_out;
+            }
+          }
+        }
       }
     } else if (scale_num == 2) {
+      // Dequant op is after quantized op
+      // Dequantize the output tensor of quantized op
       int batch_size = in->dims()[0];
       int channel = in->dims()[1];
       const T* scale_one = scales[0]->data<T>();
@@ -157,6 +186,18 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker
         "Quantization bit numbers in quantization stage. "
         "The size of `quant_bits` should be equal to the size of `Scales`.")
         .SetDefault({8});
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
 
     AddComment(R"DOC(
 FakeChannelWiseDequantizeMaxAbsOp operator.
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index 02f9dc827d68cbb58447ed1557ff4bf310b2c017..54a92b055a39d49ea061250b066957f933fb975e 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -45,8 +45,9 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
 };
 
 template <typename T>
-__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
-                                   int num, int channel, T* out) {
+__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
+                                             T max_range, int num, int channel,
+                                             T* out) {
   int tid = threadIdx.x;
   int channel_size = num / channel;
   const T* in_c = in + blockIdx.x * channel_size;
@@ -56,6 +57,23 @@ __global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
   }
 }
 
+template <typename T>
+__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale,
+                                             T max_range, const int num,
+                                             const int cin, const int cout,
+                                             T* out) {
+  int cout_wh_size = num / cin;
+  int wh_size = cout_wh_size / cout;
+
+  T s = scale[blockIdx.x];
+  const T* in_current = in + threadIdx.x * cout_wh_size + blockIdx.x * wh_size;
+  T* out_current = out + threadIdx.x * cout_wh_size + blockIdx.x * wh_size;
+
+  for (int i = 0; i < wh_size; i++) {
+    out_current[i] = in_current[i] * s / max_range;
+  }
+}
+
 template <typename T>
 __global__ void DequantizeTwoScale(const T* in, const T* scale_one,
                                    const T* scale_two, T max_range, int num,
@@ -74,18 +92,29 @@ template <typename T>
 struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
+                  const int scale_num, T max_range, const int quant_axis,
+                  framework::Tensor* out) {
+    auto in_dims = in->dims();
     const T* in_data = in->data<T>();
     T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
     if (scale_num == 1) {
       int num = in->numel();
-      int channel = in->dims()[0];
       const T* scale_factor = scales[0]->data<T>();
-      int block = 1024;
-      int grid = channel;
-      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          in_data, scale_factor, max_range, num, channel, out_data);
+      if (quant_axis == 0) {
+        int grid = in_dims[0];
+        int block = 1024;
+        DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[0], out_data);
+      } else if (quant_axis == 1) {
+        // Dequantize weight of Cin * Cout * W * H
+        int grid = in_dims[1];
+        int block = in_dims[0];
+        DequantizeOneScaleQuantAxis1<T><<<grid, block, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[0], in_dims[1],
+            out_data);
+      }
     } else if (scale_num == 2) {
+      // Not need to consider quant_axis
       int num = in->numel();
       int batch_size = in->dims()[0];
       int channel = in->dims()[1];
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index 500960098f5ce5e66af5690138c15cc0eaa80d83..6ddb12771fd5176dbe27642adcb2ac82e4d7bfbf 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -33,7 +33,7 @@ template <typename DeviceContext, typename T>
 struct ChannelDequantizeFunctor {
   void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                   const framework::Tensor** scales, const int scale_num,
-                  T max_range, framework::Tensor* out);
+                  T max_range, const int quant_axis, framework::Tensor* out);
 };
 
 template <typename DeviceContext, typename T>
@@ -63,6 +63,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<framework::Tensor>("Out");
 
     auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
+    auto quant_axis = ctx.Attr<int>("quant_axis");
     int max_range = 1;
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
@@ -70,12 +71,12 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     int scale_num = scales.size();
     if (scale_num == 1) {
       PADDLE_ENFORCE_EQ(
-          scales[0]->numel(), in->dims()[0],
+          scales[0]->numel(), in->dims()[quant_axis],
           platform::errors::PreconditionNotMet(
               "The number of first scale values must be the same with "
-              "first dimension value of Input(X) when the `Scales` has only "
-              "one element, but %ld != %ld here.",
-              scales[0]->numel(), in->dims()[0]));
+              "quant_axis dimension value of Input(X) when the `Scales` has "
+              "only one element, but %ld != %ld here.",
+              scales[0]->numel(), in->dims()[quant_axis]));
       max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
     } else if (scale_num == 2) {
       PADDLE_ENFORCE_EQ(
@@ -94,7 +95,8 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
                    (std::pow(2, quant_bits[1] - 1) - 1);
     }
     ChannelDequantizeFunctor<DeviceContext, T>()(
-        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range),
+        quant_axis, out);
   }
 };
 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 358f122c8359fa60f2c99492db8851c8a5fc5293..04ac4a35208a54361a4f434e68095e9519ee12e9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fake_quantize_op.h"
+#include <algorithm>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/clip_op.h"
@@ -39,13 +40,41 @@ template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
 
 template <typename T>
 struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    const int channel_size = num / channel;
-    for (int i = 0; i < channel; i++) {
-      auto* start = in + i * channel_size;
-      auto* end = in + (i + 1) * channel_size;
-      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in_tensor, const int quant_axis,
+                  T* out_abs_max) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+    auto* in_data = in_tensor.data<T>();
+    auto in_dims = in_tensor.dims();
+    const int64_t channel = in_dims[quant_axis];
+    if (quant_axis == 0) {
+      const int64_t channel_size = in_tensor.numel() / channel;
+      for (int64_t i = 0; i < channel; i++) {
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        out_abs_max[i] =
+            std::abs(*(std::max_element(start, end, Compare<T>())));
+      }
+    } else if (quant_axis == 1) {
+      for (int64_t i = 0; i < channel; i++) {
+        out_abs_max[i] = 0;
+      }
+      const int64_t step_i = in_tensor.numel() / in_dims[0];
+      const int64_t step_j = in_tensor.numel() / (in_dims[0] * in_dims[1]);
+      for (int64_t i = 0; i < in_dims[0]; i++) {
+        for (int64_t j = 0; j < in_dims[1]; j++) {
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          T abs_max = std::abs(*(std::max_element(start, end, Compare<T>())));
+          out_abs_max[j] = std::max(out_abs_max[j], abs_max);
+        }
+      }
     }
   }
 };
@@ -92,26 +121,53 @@ template <typename T>
 struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
                   const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
+                  const int bin_cnt, const int quant_axis,
                   framework::Tensor* out) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
     auto* scale_data = scale.data<T>();
     auto* in_data = in.data<T>();
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    const int channel_size = in.numel() / channel;
+    auto in_dims = in.dims();
+    const int64_t channel = in_dims[quant_axis];
     platform::Transform<platform::CPUDeviceContext> trans;
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      auto* start = in_data + i * channel_size;
-      auto* end = in_data + (i + 1) * channel_size;
-      trans(ctx, start, end, out_data + i * channel_size,
-            ClipFunctor<T>(-s, s));
-    }
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      T inv_s = inverse(s);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-      out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
+    if (quant_axis == 0) {
+      const int64_t channel_size = in.numel() / channel;
+      for (int64_t i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        trans(ctx, start, end, out_data + i * channel_size,
+              ClipFunctor<T>(-s, s));
+      }
+      for (int64_t i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        T inv_s = inverse(s);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
+      }
+    } else if (quant_axis == 1) {
+      const int64_t step_i = in.numel() / in_dims[0];
+      const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
+      for (int i = 0; i < in_dims[0]; i++) {
+        for (int j = 0; j < in_dims[1]; j++) {
+          T s = scale_data[j];
+          T inv_s = inverse(s);
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          auto* cur_out_data = out_data + i * step_i + j * step_j;
+          trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
+          for (int k = 0; k < step_j; k++) {
+            cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]);
+          }
+        }
+      }
     }
   }
 };
@@ -247,8 +303,9 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
                    "FakeChannelWiseQuantizeAbsMax");
     OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
                    "FakeChannelWiseQuantizeAbsMax");
+    int quant_axis = ctx->Attrs().Get<int>("quant_axis");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -269,6 +326,18 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
     AddOutput("OutScale", "(Tensor) Current channel wise scale");
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
     AddAttr<int>("bit_length", "(int, default 8)")
         .SetDefault(8)
         .AddCustomChecker([](const int& bit_length) {
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 75a55fa821f0af664ad18cc20c90cd2f3d61d5d0..6ff3c7ec632f236fe4ae6c6504537df3b8a46b7a 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -75,8 +75,8 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
 template <typename T>
-__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
-                                        T* out) {
+__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
+                                                  const int c, T* out) {
   int tid = threadIdx.x;
   int channel_size = n / c;
   const T* in_c = in + blockIdx.x * channel_size;
@@ -100,14 +100,69 @@ __global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
   }
 }
 
+template <typename T>
+__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
+                                                  const int cin, const int cout,
+                                                  T* out) {
+  extern __shared__ T shared_max_data[];
+  int cout_wh_size = n / cin;
+  int wh_size = n / (cin * cout);
+
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  const T* in_current = in + tid * cout_wh_size + bid * wh_size;
+  shared_max_data[tid] = T(0);
+  for (int i = 0; i < wh_size; i++) {
+    T tmp = fabs(in_current[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  __syncthreads();
+
+  int len = blockDim.x;
+  for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
+    if (tid < i && tid + i < len &&
+        shared_max_data[tid] < shared_max_data[tid + i]) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    if (i == 1) {
+      i = 0;  // break the loop
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[bid] = shared_max_data[0];
+  }
+}
+
 template <typename T>
 struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    int block = 1024;
-    int grid = channel;
-    FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
-        in, num, channel, out);
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in_tensor, const int quant_axis,
+                  T* out_abs_max) {
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+    const int num = in_tensor.numel();
+    auto in_dims = in_tensor.dims();
+    int channel = in_dims[quant_axis];
+    const T* in_data = in_tensor.data<T>();
+    if (quant_axis == 0) {
+      int grid = channel;
+      int block = 1024;
+      FindChannelAbsMaxKernelQuantAxis0<
+          T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
+          in_data, num, channel, out_abs_max);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[1];
+      int block = in_dims[0];
+      FindChannelAbsMaxKernelQuantAxis1<
+          T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
+          in_data, num, in_dims[0], in_dims[1], out_abs_max);
+    }
   }
 };
 
@@ -189,10 +244,12 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
 template struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext,
                                                float>;
 
+// ChannelClipAndQuantKernel for quant_axis is 0
 template <typename T>
-__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
-                                          const int bin_cnt, const int n,
-                                          const int c, T* out) {
+__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
+                                                    const int bin_cnt,
+                                                    const int n, const int c,
+                                                    T* out) {
   int tid = threadIdx.x;
 
   int channel_size = n / c;
@@ -211,22 +268,57 @@ __global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
   }
 }
 
+// ChannelClipAndQuantKernel for quant_axis is 1
+template <typename T>
+__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale,
+                                                    const int bin_cnt,
+                                                    const int n, const int cin,
+                                                    const int cout, T* out) {
+  T s = scale[blockIdx.x % cout];
+  T inv_s = inverse(s);
+
+  int wh_size = n / (cin * cout);
+  const T* in_c = in + blockIdx.x * wh_size;
+  T* out_c = out + blockIdx.x * wh_size;
+
+  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v);
+  }
+}
+
 template <typename T>
 struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& ctx,
                   const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
+                  const int bin_cnt, const int quant_axis,
                   framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = channel;
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
 
+    int num = in.numel();
+    auto in_dims = in.dims();
     const T* in_data = in.data<T>();
     const T* scale_data = scale.data<T>();
     T* out_data = out->mutable_data<T>(ctx.GetPlace());
 
-    ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, channel, out_data);
+    if (quant_axis == 0) {
+      int grid = in_dims[0];
+      int block = 1024;
+      ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[0] * in_dims[1];
+      int block = 1024;
+      ChannelClipAndQuantKernelQuantAxis1<T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 4136217fb0c5f600971c1c04f803b65de9bbecb4..5c6e0b1f6e26d84462a18da910b412f03b93285d 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -61,15 +61,15 @@ struct FindRangeAbsMaxFunctor {
 
 template <typename DeviceContext, typename T>
 struct FindChannelAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const T* in, const int num,
-                  const int channel, T* out);
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in_tensor,
+                  const int quant_axis, T* out_abs_max);
 };
 
 template <typename DeviceContext, typename T>
 struct ChannelClipAndFakeQuantFunctor {
   void operator()(const DeviceContext& ctx, const framework::Tensor& in,
                   const framework::Tensor& scale, const int bin_cnt,
-                  const int channel, framework::Tensor* out);
+                  const int quant_axis, framework::Tensor* out);
 };
 
 template <typename DeviceContext, typename T>
@@ -144,12 +144,13 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
 
     int bit_length = context.Attr<int>("bit_length");
     int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    int quant_axis = context.Attr<int>("quant_axis");
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    FindChannelAbsMaxFunctor<DeviceContext, T>()(
-        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
+                                                 out_scale_data);
     ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
+        dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
   }
 };
 
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index e8f371cb4877f343d108e8528345be03cd9b354b..b22f28fbbe3ce8ce178a3d9c17a048817cb750e7 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -216,6 +216,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
       VLOG(3) << "cuDNN forward algo " << algo;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, algo, &workspace_size_in_bytes));
+      if (workspace_size_in_bytes > workspace_size_limit)
+        workspace_size_limit = workspace_size_in_bytes;
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
           [&]() -> cudnnConvolutionFwdAlgo_t {
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc
index c9e8af6153b672b821355096078e1d186508034c..738e069081511ed2e6df56633971f0db21211ac1 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op.cc
@@ -22,8 +22,14 @@ class FusionGroupOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    const size_t num_ins = ctx->Inputs("Inputs").size();
-    const size_t num_outs = ctx->Outputs("Outs").size();
+    OP_INOUT_CHECK(ctx->HasInputs("Inputs"), "Input", "Inputs", "FusionGroup");
+    OP_INOUT_CHECK(ctx->HasOutputs("Outs"), "Output", "Outs", "FusionGroup");
+
+    auto input_names = ctx->Inputs("Inputs");
+    auto output_names = ctx->Outputs("Outs");
+
+    const size_t num_ins = input_names.size();
+    const size_t num_outs = output_names.size();
 
     PADDLE_ENFORCE_GE(
         num_ins, 1UL,
@@ -42,9 +48,12 @@ class FusionGroupOp : public framework::OperatorWithKernel {
     std::vector<framework::DDim> x_dims = ctx->GetInputsDim("Inputs");
     if (type == 0) {
       for (size_t i = 1; i < num_ins; ++i) {
-        PADDLE_ENFORCE_EQ(x_dims[0], x_dims[i],
-                          platform::errors::InvalidArgument(
-                              "All the inputs' dims should be the same."));
+        PADDLE_ENFORCE_EQ(
+            x_dims[0], x_dims[i],
+            platform::errors::InvalidArgument(
+                "All the inputs' dims is expected to be the same. "
+                "But recieved [%s] (name: %s) vs [%s] (name: %s).",
+                x_dims[0], input_names[0], x_dims[i], input_names[i]));
       }
       std::vector<framework::DDim> out_dims;
       for (size_t j = 0; j < num_outs; ++j) {
@@ -76,11 +85,11 @@ class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Outs",
               "(std::vector<LoDTensor>) The outputs of fusion_group op.")
         .AsDuplicable();
-    AddAttr<std::vector<std::string>>(
-        "outs_data_type", "The data type of Outputs in fusion_group op.")
+    AddAttr<std::vector<int>>("outs_dtype",
+                              "The data type of Outputs in fusion_group op.")
         .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "inputs_data_type", "The data type of Inputs in fusion_group op.")
+    AddAttr<std::vector<int>>("inputs_dtype",
+                              "The data type of Inputs in fusion_group op.")
         .SetDefault({});
     AddAttr<int>("type", "Fusion type.").SetDefault(0);
     AddAttr<std::string>("func_name", "Name of the generated functions.")
diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h
index 8449c6b63b1a176071c2197de063b90ec2a535eb..5e5f2c60ffbd48d801aa4cff1b074170c44ed88a 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.h
+++ b/paddle/fluid/operators/fused/fusion_group_op.h
@@ -24,14 +24,14 @@ namespace operators {
 
 static void MutableMultiTypeData(
     std::vector<paddle::framework::LoDTensor*>* var,
-    const std::vector<std::string>& data_type, const platform::Place& place) {
+    const std::vector<int>& data_type, const platform::Place& place) {
   for (size_t i = 0; i < var->size(); i++) {
-    if (data_type[i] == "float") {
+    if (data_type[i] == framework::proto::VarType::FP32) {
       (*var)[i]->mutable_data<float>(place);
-    } else if (data_type[i] == "double") {
-      (*var)[i]->mutable_data<double>(place);
-    } else if (data_type[i] == "::paddle::platform::float16") {
+    } else if (data_type[i] == framework::proto::VarType::FP16) {
       (*var)[i]->mutable_data<paddle::platform::float16>(place);
+    } else if (data_type[i] == framework::proto::VarType::FP64) {
+      (*var)[i]->mutable_data<double>(place);
     }
   }
 }
@@ -43,15 +43,15 @@ class FusionGroupKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<framework::LoDTensor>("Inputs");
     auto outs = ctx.MultiOutput<framework::LoDTensor>("Outs");
     int type = ctx.Attr<int>("type");
-    auto outs_type = ctx.Attr<std::vector<std::string>>("outs_data_type");
-    auto inputs_type = ctx.Attr<std::vector<std::string>>("inputs_data_type");
+    const auto& outs_dtype = ctx.Attr<std::vector<int>>("outs_dtype");
+    const auto& inputs_dtype = ctx.Attr<std::vector<int>>("inputs_dtype");
 
     size_t num_ins = ins.size();
     size_t num_outs = outs.size();
 
     auto place = ctx.GetPlace();
 
-    MutableMultiTypeData(&outs, outs_type, place);
+    MutableMultiTypeData(&outs, outs_dtype, place);
 
     std::string func_name = ctx.Attr<std::string>("func_name");
     platform::DeviceCode* dev_code =
@@ -64,22 +64,22 @@ class FusionGroupKernel : public framework::OpKernel<T> {
       args.push_back(&n);
       std::vector<const void*> ptrs(num_ins + num_outs);
       for (size_t i = 0; i < num_ins; ++i) {
-        if (inputs_type[i] == "::paddle::platform::float16") {
+        if (inputs_dtype[i] == framework::proto::VarType::FP16) {
           ptrs[i] = ins[i]->data<paddle::platform::float16>();
-        } else if (inputs_type[i] == "double") {
-          ptrs[i] = ins[i]->data<double>();
-        } else if (inputs_type[i] == "float") {
+        } else if (inputs_dtype[i] == framework::proto::VarType::FP32) {
           ptrs[i] = ins[i]->data<float>();
+        } else if (inputs_dtype[i] == framework::proto::VarType::FP64) {
+          ptrs[i] = ins[i]->data<double>();
         }
         args.push_back(&ptrs[i]);
       }
       for (size_t j = 0; j < num_outs; ++j) {
-        if (outs_type[j] == "::paddle::platform::float16") {
+        if (outs_dtype[j] == framework::proto::VarType::FP16) {
           ptrs[num_ins + j] = outs[j]->data<paddle::platform::float16>();
-        } else if (outs_type[j] == "double") {
-          ptrs[num_ins + j] = outs[j]->data<double>();
-        } else if (outs_type[j] == "float") {
+        } else if (outs_dtype[j] == framework::proto::VarType::FP32) {
           ptrs[num_ins + j] = outs[j]->data<float>();
+        } else if (outs_dtype[j] == framework::proto::VarType::FP64) {
+          ptrs[num_ins + j] = outs[j]->data<double>();
         }
         args.push_back(&ptrs[num_ins + j]);
       }
diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc
index 48e7d6af397849491c1afeb65c02878b88ccd6cf..d50c829b475752cfad5a41500c6d66d1ecc4c8bf 100644
--- a/paddle/fluid/operators/fused/fusion_group_op_test.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc
@@ -57,10 +57,14 @@ framework::OpDesc* CreateFusionGroupOp(
     const std::vector<std::string>& input_names,
     const std::vector<std::vector<int64_t>>& input_shapes,
     const std::vector<std::string>& output_names, int type,
-    const std::vector<std::string>& inputs_data_type,
-    const std::vector<std::string>& outs_data_type, std::string func_name) {
+    std::string func_name) {
   EXPECT_EQ(input_names.size(), input_shapes.size());
 
+  std::vector<int> input_dtypes(input_names.size(),
+                                framework::proto::VarType::FP32);
+  std::vector<int> output_dtypes(output_names.size(),
+                                 framework::proto::VarType::FP32);
+
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto* var = program->MutableBlock(0)->Var(input_names[i]);
     var->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -77,8 +81,8 @@ framework::OpDesc* CreateFusionGroupOp(
   op->SetType("fusion_group");
   op->SetInput("Inputs", input_names);
   op->SetOutput("Outs", output_names);
-  op->SetAttr("inputs_data_type", inputs_data_type);
-  op->SetAttr("outs_data_type", outs_data_type);
+  op->SetAttr("inputs_dtype", input_dtypes);
+  op->SetAttr("outs_dtype", output_dtypes);
   op->SetAttr("type", type);
   op->SetAttr("func_name", func_name);
   op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
@@ -133,8 +137,6 @@ void CheckOutputs(framework::Scope* scope,
 void TestMain(const std::vector<std::string>& input_names,
               const std::vector<std::vector<int64_t>>& input_shapes,
               const std::vector<std::string>& output_names, int type,
-              const std::vector<std::string>& inputs_data_type,
-              const std::vector<std::string>& outs_data_type,
               std::string func_name, std::string cuda_kernel_str,
               CPUKernelFunc cpu_kernel_func) {
   // Compile the device code
@@ -144,9 +146,8 @@ void TestMain(const std::vector<std::string>& input_names,
 
   // Create a ProgramDesc that has a fusion_group_op.
   framework::ProgramDesc program;
-  framework::OpDesc* op_desc =
-      CreateFusionGroupOp(&program, input_names, input_shapes, output_names,
-                          type, inputs_data_type, outs_data_type, func_name);
+  framework::OpDesc* op_desc = CreateFusionGroupOp(
+      &program, input_names, input_shapes, output_names, type, func_name);
   auto fusion_group_op = framework::OpRegistry::CreateOp(*op_desc);
 
   framework::Scope scope;
@@ -216,11 +217,8 @@ void elementwise_cuda_kernel_0(size_t n, float *x, float* y, float* z) {
     }
   };
 
-  std::vector<std::string> inputs_data_type(input_names.size(), "float");
-  std::vector<std::string> outs_data_type(output_names.size(), "float");
-  TestMain(input_names, input_shapes, output_names, 0, inputs_data_type,
-           outs_data_type, "elementwise_cuda_kernel_0", kernel,
-           elementwise_cpu_kernel_0);
+  TestMain(input_names, input_shapes, output_names, 0,
+           "elementwise_cuda_kernel_0", kernel, elementwise_cpu_kernel_0);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index f59d46ec79bd0960392ed1b8b3c8ee27b2317e39..c4bdd9e439c54db03f8fa8c4fe439ed6edbd0c7a 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -158,5 +159,133 @@ void GPUGatherNd(const framework::ExecutionContext& context,
       end_size);
 }
 
+template <typename T, typename U>
+__global__ void GatherGPUKernel(const T* input, const U* index, T* out,
+                                int outer_dim_size, int inner_dim_size,
+                                int out_index_dim_size,
+                                int input_index_dim_size, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    int inner_dim_index = idx / (outer_dim_size * out_index_dim_size);
+    int next_idx = idx % (outer_dim_size * out_index_dim_size);
+    int index_dim_index = next_idx / (outer_dim_size);
+    int out_dim_index = next_idx % outer_dim_size;
+    int input_index =
+        inner_dim_index * (outer_dim_size * input_index_dim_size) +
+        index[index_dim_index] * outer_dim_size + out_dim_index;
+    out[idx] = input[input_index];
+  }
+}
+
+template <typename T, typename U>
+__global__ void GatherGradGPUKernel(const T* input, const U* index, T* out,
+                                    int outer_dim_size, int inner_dim_size,
+                                    int input_index_dim_size,
+                                    int out_index_dim_size, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    int inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
+    int next_idx = idx % (outer_dim_size * input_index_dim_size);
+    int index_dim_index = next_idx / (outer_dim_size);
+    int out_dim_index = next_idx % outer_dim_size;
+    int out_index = inner_dim_index * (outer_dim_size * out_index_dim_size) +
+                    index[index_dim_index] * outer_dim_size + out_dim_index;
+    paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx));
+  }
+}
+
+template <typename T, typename U, typename V>
+void GatherV2CUDAFunction(const Tensor* input, const Tensor* index,
+                          const Tensor* axis, Tensor* out,
+                          const paddle::platform::Place& place,
+                          const framework::ExecutionContext& ctx) {
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+  auto* index_data = index->data<U>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  Tensor cpu_axis;
+  framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis);
+  int axis_index = cpu_axis.data<V>()[0];
+  int index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+  std::vector<int> out_dim_vec;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  out_dim_vec.push_back(index_size);
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  auto out_dim = framework::make_ddim(out_dim_vec);
+
+  out->Resize(out_dim);
+  auto* out_data = out->mutable_data<T>(place);
+  int out_size = out->numel();
+
+  int threads = 512;
+  int grid = (out_size + threads - 1) / threads;
+  auto stream = ctx.cuda_device_context().stream();
+  GatherGPUKernel<T, U><<<grid, threads, 0, stream>>>(
+      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
+      index_size, index_dim_size, out_size);
+}
+
+template <typename T, typename U, typename V>
+void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index,
+                              const Tensor* axis, Tensor* out,
+                              const paddle::platform::Place& place,
+                              const framework::ExecutionContext& ctx) {
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  Tensor cpu_axis;
+  framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis);
+  int axis_index = cpu_axis.data<V>()[0];
+  int input_index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+  }
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+  }
+
+  auto* out_data = out->mutable_data<T>(place);
+  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto out_dim = out->dims();
+  int out_index_dim_size = out_dim[axis_index];
+  operators::math::set_constant(*dev_ctx, out, 0.0);
+
+  int threads = 512;
+  int grid = (input_size + threads - 1) / threads;
+  auto stream = ctx.cuda_device_context().stream();
+  GatherGradGPUKernel<T, U><<<grid, threads, 0, stream>>>(
+      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
+      input_index_dim_size, out_index_dim_size, input_size);
+}
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index f5a7bffe4745360a307a4b7c61b30c871cf6c756..c12a3b8adc97893f523b307a56c0e6b04ea8d675 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 #include <memory.h>
 #include <cstring>
+#include <vector>
 
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -124,5 +126,110 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
   }
 }
 
+template <typename T, typename U, typename V>
+void GatherV2Function(const Tensor* input, const Tensor* index,
+                      const Tensor* axis, Tensor* out,
+                      const paddle::platform::Place& place) {
+  auto* axis_data = axis->data<V>();
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  int axis_index = axis_data[0];
+
+  int input_index_dim_size = input_dim[axis_index];
+  for (int i = 0; i < index_size; i++) {
+    PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size,
+                      platform::errors::InvalidArgument(
+                          "The element of Index must be less than the size of "
+                          "input dim size of axis which is %d, but received "
+                          "index element which is %d in the %d index.",
+                          input_index_dim_size, index_data[i], i));
+  }
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+  std::vector<int> out_dim_vec;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  out_dim_vec.push_back(index_size);
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  auto out_dim = framework::make_ddim(out_dim_vec);
+
+  out->Resize(out_dim);
+  auto* out_data = out->mutable_data<T>(place);
+
+  int out_index = 0;
+  for (int i = 0; i < inner_dim_size; i++) {
+    for (int j = 0; j < index_size; j++) {
+      for (int k = 0; k < outer_dim_size; k++) {
+        int index = k + index_data[j] * outer_dim_size +
+                    (i * input_size / inner_dim_size);
+        out_data[out_index] = input_data[index];
+        out_index++;
+      }
+    }
+  }
+}
+
+template <typename T, typename U, typename V>
+void GatherV2GradFunction(const Tensor* input, const Tensor* index,
+                          const Tensor* axis, Tensor* out,
+                          const paddle::platform::Place& place) {
+  auto* axis_data = axis->data<V>();
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  int axis_index = axis_data[0];
+  int input_index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+  }
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+  }
+
+  auto* out_data = out->mutable_data<T>(place);
+  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto out_dim = out->dims();
+  int out_index_dim_size = out_dim[axis_index];
+  operators::math::set_constant(*dev_ctx, out, 0.0);
+
+  for (int i = 0; i < inner_dim_size; i++) {
+    for (int j = 0; j < input_index_dim_size; j++) {
+      for (int k = 0; k < outer_dim_size; k++) {
+        int index = k + index_data[j] * outer_dim_size +
+                    i * outer_dim_size * out_index_dim_size;
+        out_data[index] += input_data[j * outer_dim_size + k];
+      }
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc
index c22c8a18ca63a05265ac6991cf0e0cbd9e7ea5ed..1427bd04d3442be26be931ca31bf358ebd23efae 100644
--- a/paddle/fluid/operators/gather_nd_op.cc
+++ b/paddle/fluid/operators/gather_nd_op.cc
@@ -45,7 +45,7 @@ class GatherNdOp : public framework::OperatorWithKernel {
         index_dims[index_dims_size - 1], x_dims_size,
         platform::errors::InvalidArgument(
             "Input(Index).shape[-1] should be no greater than Input(X).rank"));
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
+    PADDLE_ENFORCE_GE(index_dims_size, 1UL,
                       platform::errors::InvalidArgument(
                           "The rank of Input(Index) should be greater than 1"));
 
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 6a3abaa600281ac4a9762d5c73d398974abbf041..28afeb6f541c68fe7e0719a782fd8c9147b15163 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
-
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace operators {
 
@@ -78,6 +78,9 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
+    AddInput("Axis",
+             "The Tensor which contains the axis that we do gather operation.")
+        .AsDispensable();
     AddOutput("Out", "The output of gather op");
     AddAttr<bool>(
         "overwrite",
@@ -120,6 +123,8 @@ class GatherGradOpMaker : public framework::SingleGradOpMaker<T> {
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("gather_grad");
     op->SetInput("Index", this->Input("Index"));
+    op->SetInput("Axis", this->Input("Axis"));
+
     op->SetInput("X", this->Input("X"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
@@ -147,3 +152,7 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
                        ops::GatherGradientOpKernel<int>,
                        ops::GatherGradientOpKernel<uint8_t>,
                        ops::GatherGradientOpKernel<int64_t>);
+REGISTER_OP_VERSION(gather)
+    .AddCheckpoint(R"ROC(upgrad gather, add attribut [axis])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "axis", "Specify the axis of gather operation.", {}));
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 5bef547c0542b922f646f72ffb7310ef4eb279e9..37fbfb21f60a0568390c6798dc305c91fc8af886 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -31,6 +31,33 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto *output = ctx.Output<Tensor>("Out");
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2CUDAFunction<T, int32_t, int32_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2CUDAFunction<T, int32_t, int64_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2CUDAFunction<T, int64_t, int32_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2CUDAFunction<T, int64_t, int64_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      return;
+    }
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
     const auto &index_type = index->type();
@@ -64,6 +91,34 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradCUDAFunction<T, int32_t, int32_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradCUDAFunction<T, int32_t, int64_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradCUDAFunction<T, int64_t, int32_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradCUDAFunction<T, int64_t, int64_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      return;
+    }
+
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index e4ce13ca8fc0b49e997749d0f47f15213a3b44f7..8ec0d6ce0b69c791f9bff58f1681f8d4543c57dd 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -35,6 +35,30 @@ class GatherOpKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto *output = ctx.Output<Tensor>("Out");
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2Function<T, int32_t, int32_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2Function<T, int32_t, int64_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2Function<T, int64_t, int32_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2Function<T, int64_t, int64_t>(x, index, axis, output, place);
+      }
+      return;
+    }
+
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
 
@@ -70,6 +94,30 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradFunction<T, int32_t, int32_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradFunction<T, int32_t, int64_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradFunction<T, int64_t, int32_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradFunction<T, int64_t, int64_t>(dO, index, axis, dX, place);
+      }
+      return;
+    }
+
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 253078751ce66dd2a6d52dbdd5fe6b5c0ed21849..111d4ad4490074fb53671f6f3180cf17c5abe913 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <random>
+
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -31,22 +33,29 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
     float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
     std::normal_distribution<T> dist(mean, std);
-
     const std::string op_type = "gaussian_random";
     auto shape = GetShape(context, op_type);
     tensor->Resize(shape);
     int64_t size = tensor->numel();
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index 3bf34fc685ee8af39b66f444c35d606c4b5d8ffb..93f9e108723fbd56e0d3bf5d439614c2c20bb393 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -41,13 +41,14 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
 
     int n = input->dims()[0];
     int c = input->dims()[1];
-    int h = input->dims()[2];
-    int w = input->dims()[3];
-    const int size[4] = {n, c, h, w};
+    int out_h = grid->dims()[1];
+    int out_w = grid->dims()[2];
+    const int size[4] = {n, c, out_h, out_w};
 
     const T* input_data = input->data<T>();
     const T* grid_data = grid->data<T>();
-    T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    T* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
 
     ScopedSpatialTransformerDescriptor st_desc;
     cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
@@ -97,7 +98,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
     const T* grid_data = grid->data<T>();
     const T* output_grad_data = output_grad->data<T>();
     T* input_grad_data =
-        input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
+        input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
     T* grid_grad_data =
         grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 5be490379642e8761a6821fa0dc0d332ca5b41ef..deb71b807128e5c0b173b517e60832894ced41e5 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -58,21 +59,10 @@ class GridSampleOp : public framework::OperatorWithKernel {
               "Input(X) and Input(Grid) dimension[0] should be equal, but "
               "received X dimension[0](%d) != Grid dimension[0](%d)",
               x_dims[0], grid_dims[0]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[1], x_dims[2],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[2] and Input(Grid) dims[1] should be equal, but "
-              "received X dimension[2](%d) != Grid dimension[1](%d)",
-              x_dims[2], grid_dims[1]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[2], x_dims[3],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[3] and Input(Grid) dims[2] should be equal, but "
-              "received X dimension[3](%d) != Grid dimension[2](%d)",
-              x_dims[3], grid_dims[2]));
     }
 
-    ctx->SetOutputDim("Output", x_dims);
+    ctx->SetOutputDim("Output",
+                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
     ctx->ShareLoD("X", "Output");
   }
 
@@ -108,15 +98,37 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
         "(bool, default true) Only used in cudnn kernel, need install cudnn")
         .SetDefault(true);
 
+    AddAttr<bool>(
+        "align_corners",
+        "(bool, default true) If align_corners is true, it will project"
+        "-1 and 1 to the centers of the corner pixels. Otherwise, it will "
+        "project"
+        "-1 and 1 to the image edges.")
+        .SetDefault(true);
+
+    AddAttr<std::string>(
+        "mode",
+        "(bool, default true) The interpolation method which can be 'bilinear'"
+        " or 'nearest'.")
+        .SetDefault("bilinear");
+
+    AddAttr<std::string>(
+        "padding_mode",
+        "(bool, default true) The padding method used when source"
+        "index is out of input images. It can be 'zeros', 'reflect' and "
+        "'border'.")
+        .SetDefault("zeros");
+
     AddComment(R"DOC(
-      This operation samples input X by using bilinear interpolation based on 
+      This operation samples input X by using bilinear or nearest interpolation based on 
       flow field grid, which is usually generated by affine_grid. The grid of
       shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
       with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
       (in width dimension) of input data x and grid_y is indexing the 3rd 
       dimension (in height dimension), finally results is the bilinear 
-      interpolation value of 4 nearest corner points.
+      interpolation value or nearest value of 4 nearest corner points.
 
+      For bilinear interpolation mode:
       Step 1:
         Get (x, y) grid coordinates and scale to [0, H-1/W-1].
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..999f990448ca6370dadacbdaee5bf3bcadcaca0e
--- /dev/null
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -0,0 +1,490 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/grid_sampler_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template <typename T>
+static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
+                                                  int sW, int H, int W,
+                                                  T delta) {
+  if (in_bounds(h, w, H, W)) {
+    platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T _unnormalize(T coord, int size,
+                                                 bool align_corners) {
+  if (align_corners) {
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
+  return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
+}
+
+template <typename T>
+static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
+                                                    int twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<T>(0);
+  }
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = fabs(in - min);
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T compute_positions(T coord, int size,
+                                                      PaddingMode padding_mode,
+                                                      bool align_corners) {
+  coord = _unnormalize<T>(coord, size, align_corners);
+  if (padding_mode == PaddingMode::border) {
+    coord = clip_indexes(coord, size - 1);
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = reflect_indexes(coord, 0, 2 * (size - 1));
+    } else {
+      coord = reflect_indexes(coord, -1, 2 * size - 1);
+    }
+    coord = clip_indexes(coord, size - 1);
+  }
+  return coord;
+}
+
+template <typename T>
+static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
+                                                           bool align_corners,
+                                                           T* grad_in) {
+  if (align_corners) {
+    *grad_in = static_cast<T>(size - 1) / 2;
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    *grad_in = static_cast<T>(size) / 2;
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
+                                                           T* grad_in) {
+  if (in <= static_cast<T>(0)) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  } else {
+    T max = static_cast<T>(clip_limit - 1);
+    if (in >= max) {
+      *grad_in = static_cast<T>(0);
+      return max;
+    } else {
+      *grad_in = static_cast<T>(1);
+      return in;
+    }
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  }
+  int grad_in_mult_;
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = in - min;
+  if (in < static_cast<T>(0)) {
+    grad_in_mult_ = -1;
+    in = -in;
+  } else {
+    grad_in_mult_ = 1;
+  }
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    *grad_in = static_cast<T>(grad_in_mult_);
+    return extra + min;
+  } else {
+    *grad_in = static_cast<T>(-grad_in_mult_);
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
+                            bool align_corners, T* grad_in) {
+  T grad_clip, grad_refl;
+  coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
+  if (padding_mode == PaddingMode::border) {
+    coord = clip_indexes_with_mask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_clip;
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
+    } else {
+      coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
+    }
+    coord = clip_indexes_with_mask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_refl * grad_clip;
+  }
+
+  return coord;
+}
+
+template <typename T>
+__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
+                                        int out_h, int out_w, int in_h,
+                                        int in_w, const T* input, const T* grid,
+                                        T* output, const Mode mode,
+                                        const PaddingMode padding_mode,
+                                        bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+  int out_sN = out_c * out_h * out_w;
+  int out_sC = out_h * out_w;
+  int out_sH = out_w;
+  int out_sW = 1;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    ix = compute_positions(ix, in_w, padding_mode, align_corners);
+    iy = compute_positions(iy, in_h, padding_mode, align_corners);
+
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        *out_ptr_NCHW = static_cast<T>(0);
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
+        }
+      }
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(round(ix));
+      int iy_nearest = static_cast<int>(round(iy));
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
+          *out_ptr_NCHW =
+              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
+        } else {
+          *out_ptr_NCHW = static_cast<T>(0);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+    VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
+            << "; out_w: " << out_w;
+    auto* output = ctx.Output<Tensor>("Output");
+    auto* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    VLOG(3) << "set constant";
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        dev_ctx, output, static_cast<T>(0));
+    int count = static_cast<int>(n * out_h * out_w);
+
+    auto cu_stream = dev_ctx.stream();
+
+    int block = 512;
+    int grid_size = (count + block - 1) / block;
+    grid_sample_cuda_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+        count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
+        grid->data<T>(), output_data, mode, padding_mode, align_corners);
+  }
+};
+
+template <typename T>
+__global__ void grid_sampler_cuda_backward_kernel(
+    const int nthreads, const T* grad_output, const T* input, const T* grid,
+    int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
+    T* grad_grid, const Mode mode, const PaddingMode padding_mode,
+    bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+
+  int gOut_sN = out_c * out_h * out_w;
+  int gOut_sC = out_h * out_w;
+  int gOut_sH = out_w;
+  int gOut_sW = 1;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    T gix_mult, giy_mult;
+    ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
+                                     &gix_mult);
+    iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
+                                     &giy_mult);
+
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      T gix = static_cast<T>(0), giy = static_cast<T>(0);
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      int inp_offset_NC = n * inp_sN;
+      for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
+               gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        T gOut = grad_output[gOut_offset];
+
+        atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
+                   nw * gOut);
+        atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
+                   ne * gOut);
+        atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
+                   sw * gOut);
+        atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
+                   se * gOut);
+
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
+          gix -= nw_val * (iy_se - iy) * gOut;
+          giy -= nw_val * (ix_se - ix) * gOut;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
+          gix += ne_val * (iy_sw - iy) * gOut;
+          giy -= ne_val * (ix - ix_sw) * gOut;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
+          gix -= sw_val * (iy - iy_ne) * gOut;
+          giy += sw_val * (ix_ne - ix) * gOut;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
+          gix += se_val * (iy - iy_nw) * gOut;
+          giy += se_val * (ix - ix_nw) * gOut;
+        }
+      }
+
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = gix_mult * gix;
+      gGrid_ptr_NHW[1] = giy_mult * giy;
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(::round(ix));
+      int iy_nearest = static_cast<int>(::round(iy));
+
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      for (int c = 0; c < out_c;
+           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
+                   in_w, grad_output[gOut_offset]);
+      }
+
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = static_cast<T>(0);
+      gGrid_ptr_NHW[1] = static_cast<T>(0);
+    }
+  }
+}
+
+template <typename T>
+class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    input_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
+        input_grad, static_cast<T>(0));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+    grid_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
+        grid_grad, static_cast<T>(0));
+
+    int count = static_cast<int>(n * out_h * out_w);
+    auto cu_stream = dev_ctx.stream();
+    int block = 512;
+    int grid_size = (count + block - 1) / block;
+    grid_sampler_cuda_backward_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
+        out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad->data<T>(),
+        mode, padding_mode, align_corners);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
+                        ops::GridSampleOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
+                        ops::GridSampleGradOpCUDAKernel<float>,
+                        ops::GridSampleGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index 08a6043eb07a6e44d46428ee195f6cb28c2ee77c..eda800e78faf5da2bb379b8101e4823c5bc2d2f8 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <iostream>
+#include <string>
+#include <utility>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/gather.h"
@@ -22,6 +25,13 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+enum class Mode {
+  bilinear,
+  nearest,
+};
+
+enum class PaddingMode { zeros, border, reflect };
+
 using Tensor = framework::Tensor;
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -39,64 +49,229 @@ static inline bool isInBound(T x, T y, T x_max, T y_max) {
 }
 
 template <typename T>
-static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
-                              const Tensor& grid, Tensor* x_w, Tensor* x_e,
-                              Tensor* y_n, Tensor* y_s, Tensor* d_w,
-                              Tensor* d_e, Tensor* d_n, Tensor* d_s) {
+static inline void unnormalize(const platform::CPUDeviceContext& ctx,
+                               Tensor* grid_slice,
+                               const int max_val,  // height-1 or width-1
+                               bool align_corners) {
   auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+
+  if (!align_corners) {
+    auto factor = static_cast<T>((max_val + 1) * 0.5);
+    grid_slice_t.device(place) =
+        (grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
+  } else {
+    auto factor = static_cast<T>(max_val * 0.5);
+    grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
+  }
+}
+
+template <typename T>
+static inline void clip(const platform::CPUDeviceContext& ctx,
+                        Tensor* grid_slice,
+                        const int max_val,  // height-1 or width-1
+                        bool align_corners, std::string padding_mode) {
+  auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  if (padding_mode == "border") {
+    grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                     .cwiseMin(static_cast<T>(max_val));
+  } else if (padding_mode == "reflect") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                       .cwiseMin(static_cast<T>(max_val));
+    }
+  }
+}
+
+template <typename T>
+static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
+                                const int max_val,  // height-1 or width-1
+                                bool align_corners, std::string padding_mode,
+                                Tensor* grid_slice, Tensor* grid_scale) {
+  auto& place = *ctx.eigen_device();
+  grid_scale->mutable_data<T>(grid_slice->dims(), ctx.GetPlace());
+
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  auto factor = static_cast<T>(max_val * 0.5);
+  if (!align_corners) {
+    factor = static_cast<T>((max_val + 1) * 0.5);
+  }
+  auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
+
+  if (padding_mode == "border") {
+    //    auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
+    auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
+                   .cwiseMin(static_cast<T>(max_val));
+
+    auto in_bound = (res == grid_slice_t);
+    grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
+    grid_slice_t.device(place) = res;
+  } else if (padding_mode == "reflect") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto is_neg = (grid_slice_t < static_cast<T>(0));
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>());
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      auto reflected =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      auto clipped = reflected.cwiseMax(static_cast<T>(0))
+                         .cwiseMin(static_cast<T>(max_val));
+      auto in_bound = (clipped == reflected).template cast<T>();
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>()) *
+          in_bound;
+      grid_slice_t.device(place) = clipped;
+    }
+  }
+}
+
+template <typename T>
+static void calcGridLocations(const platform::CPUDeviceContext& ctx,
+                              const Tensor& grid, const int in_h,
+                              const int in_w, bool align_corners,
+                              std::string padding_mode, Tensor* grid_x,
+                              Tensor* grid_y) {
   const int n = grid.dims()[0];
-  const int h = grid.dims()[1];
-  const int w = grid.dims()[2];
-  const T x_max = static_cast<T>(w - 1);
-  const T y_max = static_cast<T>(h - 1);
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
 
   // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
-  Tensor grid_x, grid_y;
-  T* grid_x_data = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  T* grid_y_data = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   const T* grid_data = grid.data<T>();
-  for (int i = 0; i < n * h * w; i++) {
+  for (int i = 0; i < n * out_h * out_w; i++) {
     grid_x_data[i] = grid_data[2 * i];
     grid_y_data[i] = grid_data[(2 * i) + 1];
   }
 
-  Tensor ones;
-  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
-  Tensor half_xmax;
-  Tensor half_ymax;
-  half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_xmax_t =
-      EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
-  half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_ymax_t =
-      EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
-
-  // scale grid to [0, h-1/w-1]
-  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
-  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-  grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
-  grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
+  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
+  clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
+}
+
+template <typename T>
+static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx,
+                                      const Tensor& grid, const int in_h,
+                                      const int in_w, bool align_corners,
+                                      std::string padding_mode, Tensor* grid_x,
+                                      Tensor* grid_y, Tensor* grid_x_scale,
+                                      Tensor* grid_y_scale) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+
+  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
+  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+
+  const T* grid_data = grid.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_x_data[i] = grid_data[2 * i];
+    grid_y_data[i] = grid_data[(2 * i) + 1];
+  }
 
+  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  clipWithMask<T>(ctx, in_w - 1, align_corners, padding_mode, grid_x,
+                  grid_x_scale);
+  clipWithMask<T>(ctx, in_h - 1, align_corners, padding_mode, grid_y,
+                  grid_y_scale);
+}
+
+template <typename T>
+static void getGridPointValue(const Tensor& input, Tensor* output,
+                              const Tensor& x, const Tensor& y) {
+  const int n = input.dims()[0];
+  const int c = input.dims()[1];
+  const int in_h = input.dims()[2];
+  const int in_w = input.dims()[3];
+  const int out_h = x.dims()[1];
+  const int out_w = x.dims()[2];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
+  auto input_t = EigenTensor<T, 4>::From(input);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
+          for (int j = 0; j < c; j++) {
+            output_t(i, j, k, l) =
+                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                        static_cast<int>(round(x_t(i, k, l))));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void allNeigbors(const platform::CPUDeviceContext& ctx,
+                        const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                        Tensor* x_w, Tensor* x_e, Tensor* y_n,
+                        Tensor* y_s,  // positions
+                        Tensor* d_w, Tensor* d_e, Tensor* d_n,
+                        Tensor* d_s,  // distance
+                        Tensor* v_wn, Tensor* v_en, Tensor* v_ws,
+                        Tensor* v_es) {  // values
+  auto& place = *ctx.eigen_device();
+
+  const int c = input.dims()[1];
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
   // calculate coords of 4 corner points
-  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  x_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  x_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  y_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  y_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   auto x_w_t = EigenTensor<T, 3>::From(*x_w);
   auto x_e_t = EigenTensor<T, 3>::From(*x_e);
   auto y_n_t = EigenTensor<T, 3>::From(*y_n);
   auto y_s_t = EigenTensor<T, 3>::From(*y_s);
+
+  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
+
   x_w_t.device(place) = grid_x_t.floor();
-  x_e_t.device(place) = x_w_t + ones_t;
+  x_e_t.device(place) = x_w_t + static_cast<T>(1);
   y_n_t.device(place) = grid_y_t.floor();
-  y_s_t.device(place) = y_n_t + ones_t;
+  y_s_t.device(place) = y_n_t + static_cast<T>(1);
 
   // calculate distances to 4 sides
-  d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  d_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   auto d_w_t = EigenTensor<T, 3>::From(*d_w);
   auto d_e_t = EigenTensor<T, 3>::From(*d_e);
   auto d_n_t = EigenTensor<T, 3>::From(*d_n);
@@ -105,28 +280,100 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
   d_e_t.device(place) = x_e_t - grid_x_t;
   d_n_t.device(place) = grid_y_t - y_n_t;
   d_s_t.device(place) = y_s_t - grid_y_t;
+
+  // calc 4 corner points value
+  v_wn->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_en->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_ws->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_es->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  getGridPointValue<T>(input, v_wn, *x_w, *y_n);
+  getGridPointValue<T>(input, v_en, *x_e, *y_n);
+  getGridPointValue<T>(input, v_ws, *x_w, *y_s);
+  getGridPointValue<T>(input, v_es, *x_e, *y_s);
 }
 
 template <typename T>
-static void GetGridPointValue(const Tensor& input, Tensor* output,
-                              const Tensor& x, const Tensor& y) {
-  const int n = input.dims()[0];
+static void bilinearInter(const platform::CPUDeviceContext& ctx,
+                          const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                          Tensor* out) {
+  auto& place = *ctx.eigen_device();
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
   const int c = input.dims()[1];
-  const int h = input.dims()[2];
-  const int w = input.dims()[3];
+
+  Tensor x_w, x_e, y_n, y_s;
+  Tensor d_w, d_e, d_n, d_s;
+  Tensor v_wn, v_en, v_ws, v_es;
+
+  allNeigbors<T>(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e,
+                 &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto d_w_scaled_t =
+      d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_e_scaled_t =
+      d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_n_scaled_t =
+      d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_s_scaled_t =
+      d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+  auto output_t = EigenTensor<T, 4>::From(*out);
+  // bilinear interpolaetion by 4 corner points
+  output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
+                           v_en_t * d_w_scaled_t * d_s_scaled_t +
+                           v_ws_t * d_e_scaled_t * d_n_scaled_t +
+                           v_es_t * d_w_scaled_t * d_n_scaled_t;
+}
+
+template <typename T>
+static void nearestInter(const platform::CPUDeviceContext& ctx,
+                         const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                         Tensor* out) {
+  auto& place = *ctx.eigen_device();
+
+  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
+  grid_x_t = grid_x_t.round();
+  grid_y_t = grid_y_t.round();
+  getGridPointValue<T>(input, out, *grid_x, *grid_y);
+}
+
+template <typename T>
+static void gatherOutputGradToInputGrad(const Tensor& output_grad,
+                                        Tensor* input_grad, const Tensor& x,
+                                        const Tensor& y, const Tensor& d1,
+                                        const Tensor& d2) {
+  const int n = output_grad.dims()[0];
+  const int c = output_grad.dims()[1];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
-  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
-  auto input_t = EigenTensor<T, 4>::From(input);
+  auto d1_t = EigenTensor<T, 3>::From(d1);
+  auto d2_t = EigenTensor<T, 3>::From(d2);
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
 
   for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
           for (int j = 0; j < c; j++) {
-            output_t(i, j, k, l) =
-                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                        static_cast<int>(round(x_t(i, k, l))));
+            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                         static_cast<int>(round(x_t(i, k, l)))) +=
+                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
           }
         }
       }
@@ -135,29 +382,28 @@ static void GetGridPointValue(const Tensor& input, Tensor* output,
 }
 
 template <typename T>
-static void GatherOutputGradToInputGrad(const Tensor& output_grad,
+static void gatherOutputGradToInputGrad(const Tensor& output_grad,
                                         Tensor* input_grad, const Tensor& x,
-                                        const Tensor& y, const Tensor& d1,
-                                        const Tensor& d2) {
+                                        const Tensor& y) {
   const int n = output_grad.dims()[0];
   const int c = output_grad.dims()[1];
-  const int h = output_grad.dims()[2];
-  const int w = output_grad.dims()[3];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
-  auto d1_t = EigenTensor<T, 3>::From(d1);
-  auto d2_t = EigenTensor<T, 3>::From(d2);
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
   for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
           for (int j = 0; j < c; j++) {
             input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
                          static_cast<int>(round(x_t(i, k, l)))) +=
-                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
+                output_grad_t(i, j, k, l);
           }
         }
       }
@@ -165,65 +411,126 @@ static void GatherOutputGradToInputGrad(const Tensor& output_grad,
   }
 }
 
+template <typename T>
+static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx,
+                               const Tensor& input, const Tensor& output_grad,
+                               Tensor* grid_x, Tensor* grid_y,
+                               Tensor* grid_x_scale, Tensor* grid_y_scale,
+                               Tensor* input_grad, Tensor* grid_grad) {
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
+  const int c = input.dims()[1];
+
+  Tensor x_w, x_e, y_n, y_s;
+  Tensor d_w, d_e, d_n, d_s;
+  Tensor v_wn, v_en, v_ws, v_es;
+
+  allNeigbors<T>(ctx, input,
+                 grid_x,  // grid_x
+                 grid_y,  // grid_y
+                 &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en,
+                 &v_ws, &v_es);
+
+  // gather output grad value to input grad by corner point coords and weight
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
+
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  Tensor grid_grad_x, grid_grad_y;
+  grid_grad_x.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  grid_grad_y.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  auto grid_grad_x_t =
+      EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
+  auto grid_grad_y_t =
+      EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < c; j++) {
+      for (int k = 0; k < out_h; k++) {
+        for (int l = 0; l < out_w; l++) {
+          grid_grad_x_t(i, k, l) +=
+              ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
+               (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
+              output_grad_t(i, j, k, l);
+          grid_grad_y_t(i, k, l) +=
+              ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
+               (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
+              output_grad_t(i, j, k, l);
+        }
+      }
+    }
+  }
+
+  //  const T x_max = static_cast<T>(in_w - 1);
+  //  const T y_max = static_cast<T>(in_h - 1);
+
+  auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
+  auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
+  grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
+  grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
+
+  // gather grid_grad [x, y] in 3rd Dim
+  T* grid_grad_data = grid_grad->data<T>();
+  T* grid_grad_x_data = grid_grad_x.data<T>();
+  T* grid_grad_y_data = grid_grad_y.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_grad_data[2 * i] = grid_grad_x_data[i];
+    grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+  }
+}
+
 template <typename DeviceContext, typename T>
 class GridSampleOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode = ctx.Attr<std::string>("padding_mode");
+    auto mode = ctx.Attr<std::string>("mode");
+
     auto* input = ctx.Input<Tensor>("X");
     auto* grid = ctx.Input<Tensor>("Grid");
 
-    const int n = input->dims()[0];
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
     const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    // calc locations and distances of 4 corner points
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
     auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), output,
         static_cast<T>(0));
 
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-    auto d_w_scaled_t =
-        d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_e_scaled_t =
-        d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_n_scaled_t =
-        d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_s_scaled_t =
-        d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-    auto output_t = EigenTensor<T, 4>::From(*output);
-    // bilinear interpolaetion by 4 corner points
-    output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
-                             v_en_t * d_w_scaled_t * d_s_scaled_t +
-                             v_ws_t * d_e_scaled_t * d_n_scaled_t +
-                             v_es_t * d_w_scaled_t * d_n_scaled_t;
+    Tensor grid_x, grid_y;
+    calcGridLocations<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
+        in_w, align_corners, padding_mode, &grid_x, &grid_y);
+    if (mode == "bilinear") {
+      bilinearInter<T>(
+          ctx.template device_context<platform::CPUDeviceContext>(), *input,
+          &grid_x, &grid_y, output);
+    } else if (mode == "nearest") {
+      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+      grid_x_t = grid_x_t.round();
+      grid_y_t = grid_y_t.round();
+      getGridPointValue<T>(*input, output, grid_x, grid_y);
+    }
   }
 };
 
@@ -231,97 +538,48 @@ template <typename DeviceContext, typename T>
 class GridSampleGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode = ctx.Attr<std::string>("padding_mode");
+    auto mode = ctx.Attr<std::string>("mode");
+
     auto* input = ctx.Input<Tensor>("X");
     auto* grid = ctx.Input<Tensor>("Grid");
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
 
-    const int n = input->dims()[0];
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
     const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), input_grad,
         static_cast<T>(0));
     auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-    grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+    grid_grad->mutable_data<T>({n, out_h, out_w, 2}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), grid_grad,
         static_cast<T>(0));
-
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
-
-    // gather output grad value to input grad by corner point coords and weight
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_s, d_e,
-                                   d_n);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_n, d_w,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_s, d_w,
-                                   d_n);
-
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-
-    auto output_grad_t = EigenTensor<T, 4>::From(*output_grad);
-
-    Tensor grid_grad_x, grid_grad_y;
-    grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
-    auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            grid_grad_x_t(i, k, l) +=
-                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-            grid_grad_y_t(i, k, l) +=
-                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-          }
-        }
-      }
-    }
-    const T x_max = static_cast<T>(w - 1);
-    const T y_max = static_cast<T>(h - 1);
-    grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
-    grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);
-
-    // gather grid_grad [x, y] in 3rd Dim
-    T* grid_grad_data = grid_grad->data<T>();
-    T* grid_grad_x_data = grid_grad_x.data<T>();
-    T* grid_grad_y_data = grid_grad_y.data<T>();
-    for (int i = 0; i < n * h * w; i++) {
-      grid_grad_data[2 * i] = grid_grad_x_data[i];
-      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+    Tensor grid_x, grid_y;
+    Tensor grid_x_scale, grid_y_scale;
+    calcGridLocationsWithGrad<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
+        in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale,
+        &grid_y_scale);
+    if (mode == "bilinear") {
+      gatherBilinearGrad<T>(ctx.template device_context<DeviceContext>(),
+                            *input, *output_grad, &grid_x, &grid_y,
+                            &grid_x_scale, &grid_y_scale, input_grad,
+                            grid_grad);
+    } else {
+      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+      grid_x_t = grid_x_t.round();
+      grid_y_t = grid_y_t.round();
+      gatherOutputGradToInputGrad<T>(*output_grad, input_grad, grid_x, grid_y);
     }
   }
 };
diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu
index 09c743c4275169ba8c53ccbd428100b2fc4483d6..4ce6856a7eade1b314d8aef1d039424ad42e07cf 100644
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
@@ -16,7 +16,9 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     huber_loss,
-    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12733a0d9f1689a020f77d23cc31b0d19b412746
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.cc
@@ -0,0 +1,695 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/interpolate_v2_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE_EQ("linear", interp_method,
+                    platform::errors::InvalidArgument(
+                        "Interpolation method can only be \"linear\" when"
+                        "Input(X) dimension is 3, but got method = %s .",
+                        interp_method));
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top prority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 1,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'size of Op(interpolate) must be 1. "
+            "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got "
+            "size = %d .",
+            inputs_name.size()));
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_w};
+    } else {
+      dim_out = {dim_x[0], out_w, dim_x[2]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+
+    return;
+  }
+
+  int out_w;
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got dimension = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(
+        scale_tensor[0], 1,
+        platform::errors::InvalidArgument(
+            "Scale's shape must be 1, but got shape = %d .", scale_tensor[0]));
+    // out_w = -1;
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      float scale_w = -1;
+      scale_w = scale[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+      if (scale_w > 0.) {
+        // round down
+        out_w = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[2] * scale_w)
+                     : static_cast<int>(dim_x[1] * scale_w));
+        // protect when input shape is -1
+        out_w = out_w > 0 ? out_w : -1;
+      }
+    } else {
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got dimention = %d .",
+            out_size_dim.size()));
+    PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument(
+                                              "OutSize's dim[0] must be 1"));
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_w};
+  } else {
+    dim_out = {dim_x[0], out_w, dim_x[2]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE(
+      "bilinear" == interp_method || "nearest" == interp_method ||
+          "bicubic" == interp_method,
+      "Interpolation method can only be \"bilinear\" or \"nearest\" when "
+      "Input(X) dimension is 4, but got method = %s .",
+      interp_method);
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top prority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 2,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'size of Op(interpolate) must be 2. "
+            "Attr(out_shape)'s length must be 2 for 4-D input "
+            "tensor, but got size = %d .",
+            inputs_name.size()));
+    int out_h = ctx->Attrs().Get<int>("out_h");
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_h, out_w};
+    } else {
+      dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+
+    return;
+  }
+
+  int out_h, out_w;
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got dimension = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(scale_tensor[0] == 2 || scale_tensor[0] == 1, true,
+                      platform::errors::InvalidArgument(
+                          "Scale's shape must be 2 or 1, but got shape = %d .",
+                          scale_tensor[0]));
+    // out_h = -1;
+    // out_w = -1;
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      float scale_h = -1;
+      float scale_w = -1;
+      scale_h = scale[0];
+      scale_w = scale[1];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+      if (scale_h > 0. && scale_w > 0.) {
+        // round down
+        out_h = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[2] * scale_h)
+                     : static_cast<int>(dim_x[1] * scale_h));
+        out_w = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[3] * scale_w)
+                     : static_cast<int>(dim_x[2] * scale_w));
+        // protect when input shape is -1
+        out_h = out_h > 0 ? out_h : -1;
+        out_w = out_w > 0 ? out_w : -1;
+      }
+    } else {
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got dimension = %d .",
+            out_size_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        out_size_dim[0], 2,
+        platform::errors::InvalidArgument(
+            "OutSize's dim[0] must be 2, but got dimention = %d .",
+            out_size_dim[0]));
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_h, out_w};
+  } else {
+    dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE_EQ(
+      "trilinear", interp_method,
+      platform::errors::InvalidArgument(
+          "Interpolation method can only be \"trilinear\" when Input(X) "
+          "dimension is 5, but got method = %s .",
+          interp_method));
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top prority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 3,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'s size of Op(interpolate) must be 3. "
+            "Attr(out_shape)'s length must be 3 for 5-D input "
+            "tensor, but got size = %d .",
+            inputs_name.size()));
+    int out_d = ctx->Attrs().Get<int>("out_d");
+    int out_h = ctx->Attrs().Get<int>("out_h");
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
+    } else {
+      dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+
+    return;
+  }
+
+  int out_d, out_h, out_w;
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got size = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(scale_tensor[0] == 3 || scale_tensor[0] == 1, true,
+                      platform::errors::InvalidArgument(
+                          "Scale's shape must be 3 or 1, but got shape = %d .",
+                          scale_tensor[0]));
+    // out_d = -1;
+    // out_h = -1;
+    // out_w = -1;
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      float scale_d = -1;
+      float scale_h = -1;
+      float scale_w = -1;
+      scale_d = scale[0];
+      scale_h = scale[1];
+      scale_w = scale[2];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+      if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+        // round down
+        out_d = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[2] * scale_d)
+                     : static_cast<int>(dim_x[1] * scale_d));
+        out_h = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[3] * scale_h)
+                     : static_cast<int>(dim_x[2] * scale_h));
+        out_w = (data_layout == DataLayout::kNCHW
+                     ? static_cast<int>(dim_x[4] * scale_w)
+                     : static_cast<int>(dim_x[3] * scale_w));
+        // protect when input shape is -1
+        out_d = out_d > 0 ? out_d : -1;
+        out_h = out_h > 0 ? out_h : -1;
+        out_w = out_w > 0 ? out_w : -1;
+      }
+    } else {
+      out_d = ctx->Attrs().Get<int>("out_d");
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                      "OutSize's dimension size must be 1, but got size =%d .",
+                      out_size_dim.size());
+    PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
+                      "OutSize's dim[0] must be 3, but got size = %d .",
+                      out_size_dim[0]);
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
+  } else {
+    dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+class InterpolateV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of InterpolateV2Op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of InterpolationOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");  // NCHW format
+    PADDLE_ENFORCE(
+        dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5,
+        platform::errors::Unimplemented(
+            "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .",
+            dim_x.size()));
+
+    if (dim_x.size() == 3) {
+      // shape check for 1D interpolate for input tensor shape NCHW
+      Interpolate1DInferShapeCheck(ctx);
+    } else if (dim_x.size() == 4) {
+      // shape check for 2D interpolate for input tensor shape NCHW
+      Interpolate2DInferShapeCheck(ctx);
+    } else {  // dim_x.size() == 5
+      // shape check for 3D interpolate for input tensor shape NCDHW
+      Interpolate3DInferShapeCheck(ctx);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "SizeTensor" || var_name == "Scale") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of interpolate operator, "
+             "This is a 4-D tensor with shape of [N, C, H, W] or a "
+             "5-D tensor with shape of [N, C, D, H, W].");
+    AddInput("OutSize",
+             "This is a 1-D tensor with two numbers to specify output size. "
+             "It should be [output_height, output_width] when input is a 4-D "
+             "tensor and should be [output_depth, output_height, output_width] "
+             "when input is a 5-D tensor. It has a higher priority than "
+             "the attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
+        .AsDispensable();
+    AddInput("SizeTensor",
+             "(vector<Tensor<int32>>, optional). If provided, interpolate will "
+             "use this. The shape of the tensor in vector MUST BE [1]. "
+             "It has the highest priority compare with Input(OutSize) and "
+             "attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("Scale",
+             "This is a 1-D tensor with one number to specify output scale. "
+             "It has the higher priority compare with attr(scale).")
+        .AsDispensable();
+    AddOutput("Out",
+              "The output tensor of interpolate operator, "
+              "This is a tensor in same rank with Input(X).");
+
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "an optional string from: \"NHWC\", \"NCHW\". "
+        "Specify that the data format of the input and output data is "
+        "channel_first or channel_last.")
+        .SetDefault("NCHW");
+    AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0);
+    AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0);
+    AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0);
+    AddAttr<std::vector<float>>("scale", "scale_d factor of interpolate op.")
+        .SetDefault(std::vector<float>{});
+    AddAttr<std::string>("interp_method",
+                         "(string, default \"bilinear\"), interpolation "
+                         "method, can be \"linear\" for linear interpolation"
+                         ",\"bilinear\" for "
+                         "bilinear interpolation, \"trilinear\" for trilinear "
+                         "interpolation and \"nearest\" for nearest "
+                         "neighbor interpolation, and \"bicubic\" for bicubic"
+                         "interpolation.")
+        .SetDefault("bilinear");
+    AddAttr<bool>(
+        "align_corners",
+        "an optional bool. Defaults to True. "
+        "If True, the centers of 4 corner pixels of the input and output "
+        "tensors are aligned, preserving the values at the corner pixels, "
+        "If False, are not aligned")
+        .SetDefault(true);
+    AddAttr<int>("align_mode",
+                 "(int, default \'1\'), optional for bilinear interpolation, "
+                 "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
+                 "can be \'1\' for src_idx = scale*dst_index .")
+        .SetDefault(1);
+    AddComment(R"DOC(
+          This operator samples input X to given output shape by using specified
+          interpolation method, the interpolation methods can be \"nearest\"
+          for nearest neighbor interpolation and \"bilinear\" for bilinear 
+          interpolation and \"linear\" for linear interpolation..
+
+          Nearest neighbor interpolation is to perform nearest neighbor interpolation
+          in both the 3rd dimension(in height direction) and the 4th dimension(in width 
+          direction) on input tensor.
+           
+          Linear interpolation is the method of using a line connecting two known quantities 
+          to determine the value of an unknown quantity between the two known quantities. 
+          
+          Bilinear interpolation is an extension of linear interpolation for 
+          interpolating functions of two variables (e.g. H-direction and 
+          W-direction in this op) on a rectilinear 2D grid. The key idea is 
+          to perform linear interpolation first in one direction, and then 
+          again in the other direction.
+
+          Trilinear interpolation is an extension of linear interpolation for 
+          interpolating functions of three variables (e.g. D-direction, 
+          H-direction and W-direction in this op) on a rectilinear 3D grid. 
+          The linear interpolation is performed on three directions.
+
+          Bicubic interpolation is an extension of cubic interpolation for interpolating
+          data points on a two-dimensional regular grid. The interpolated surface is
+          smoother than corresponding surfaces obtained by bilinear interpolation or
+          nearest-neighbor interpolation.
+
+          Align_corners and align_mode are optional parameters,the calculation method 
+          of interpolation can be selected by them.
+          
+          Example:
+
+          For scale:
+          
+            if align_corners = True and out_{size}>1 :
+
+              scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0)
+            
+            else:
+              
+              scale_{factor} = float(in_{size}/out_{size})
+            
+          
+          Nearest neighbor interpolation:
+          
+          if:
+              align_corners = False
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
+              W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+
+          else:
+              align_corners = True
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
+
+          Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+          else:
+           
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          Trilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+              
+              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+          else:
+           
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+
+              D_out = D_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          Bicubic interpolation:
+
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          For details of nearest neighbor interpolation, please refer to Wikipedia: 
+          https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
+
+          For details of bilinear interpolation, please refer to Wikipedia: 
+          https://en.wikipedia.org/wiki/Bilinear_interp_v2olation
+
+          For details of trilinear interpolation, please refer to Wikipedia: 
+          https://en.wikipedia.org/wiki/Trilinear_interp_v2olation
+
+          For details of bicubic interpolation, please refer to Wikipedia:
+          https://en.wikipedia.org/wiki/Bicubic_interpolation
+         )DOC");
+  }
+};
+
+class InterpolateV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto dim_x = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "SizeTensor" || var_name == "Scale") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class InterpolateV2GradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    if (this->HasInput("SizeTensor") > 0) {
+      op->SetInput("SizeTensor", this->Input("SizeTensor"));
+    }
+    if (this->HasInput("OutSize") > 0) {
+      op->SetInput("OutSize", this->Input("OutSize"));
+    }
+    if (this->HasInput("Scale") > 0) {
+      op->SetInput("Scale", this->Input("Scale"));
+    }
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer,
+                                    "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(bilinear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(nearest_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(nearest_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(trilinear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(trilinear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(bicubic_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(bicubic_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(nearest_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(nearest_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(trilinear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(trilinear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OPERATOR(linear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(linear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(linear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(linear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(bicubic_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>);
+REGISTER_OP_CPU_KERNEL(bicubic_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6cb8104638dea458743374014e7bef35df2dbfcc
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -0,0 +1,1578 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <algorithm>
+#include <string>
+#include "paddle/fluid/operators/interpolate_v2_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+__global__ void KeNearestNeighborInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+
+    if (data_layout == DataLayout::kNCHW) {
+      out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
+                    in_img_idy * in_img_w + in_img_idx];
+    } else {
+      out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
+                    in_img_idx * num_channels + channel_id];
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeNearestNeighborInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+
+    T* in_pos;
+    if (data_layout == DataLayout::kNCHW) {
+      in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                   in_img_idy * in_img_w + in_img_idx];
+    } else {
+      in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
+                   in_img_idx * num_channels + channel_id];
+    }
+    const T out_pos = out[out_id_h * output_w + out_id_w];
+    platform::CudaAtomicAdd(in_pos, out_pos);
+  }
+}
+
+template <typename T>
+__global__ void KeLinearInterpFw(const T* in, const size_t in_img_w,
+                                 const size_t input_w, T* out,
+                                 const size_t out_img_w, const size_t output_h,
+                                 const size_t output_w,
+                                 const size_t num_channels, const float ratio_w,
+                                 const bool align_corners, const int align_mode,
+                                 const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
+
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      const T* in_pos =
+          &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx];
+      // linear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          w2lambda * in_pos[0] + w1lambda * in_pos[w_id];
+
+    } else {
+      const T* in_pos =
+          &in[out_id_h * input_w + in_img_idx * num_channels + channel_id];
+      // linear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels];
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeLinearInterpBw(T* in, const size_t in_img_w,
+                                 const size_t input_w, const T* out,
+                                 const size_t out_img_w, const size_t output_h,
+                                 const size_t output_w,
+                                 const size_t num_channels, const T ratio_w,
+                                 const bool align_corners, const int align_mode,
+                                 const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
+                                : ratio_w * out_img_idx;
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
+
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    T* in_pos;
+    if (data_layout == DataLayout::kNCHW) {
+      in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx];
+    } else {
+      in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id];
+    }
+    const T* out_pos = &out[out_id_w];
+
+    if (data_layout == DataLayout::kNCHW) {
+      platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
+    } else {
+      platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
+                              w1lambda * out_pos[0]);
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeBilinearInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                            in_img_idy * in_img_w + in_img_idx];
+
+      // bilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
+          h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
+                      w1lambda * in_pos[h_id * in_img_w + w_id]);
+    } else {
+      const T* in_pos =
+          &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
+              in_img_idx * num_channels + channel_id];
+
+      // bilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          h2lambda *
+              (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) +
+          h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] +
+                      w1lambda * in_pos[h_id * in_img_w * num_channels +
+                                        w_id * num_channels]);
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeBilinearInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const T ratio_h, const T ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5
+                                : ratio_h * out_img_idy;
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
+                                : ratio_w * out_img_idx;
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    T* in_pos;
+    if (data_layout == DataLayout::kNCHW) {
+      in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                   in_img_idy * in_img_w + in_img_idx];
+    } else {
+      in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
+                   in_img_idx * num_channels + channel_id];
+    }
+
+    const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+    if (data_layout == DataLayout::kNCHW) {
+      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+                              h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+                              h1lambda * w1lambda * out_pos[0]);
+    } else {
+      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
+                              h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
+                              h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(
+          &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
+          h1lambda * w1lambda * out_pos[0]);
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTrilinearInterpFw(
+    const T* in, const size_t in_img_d, const size_t in_img_h,
+    const size_t in_img_w, const size_t input_h, const size_t input_w, T* out,
+    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
+    const size_t output_h, const size_t output_w, const size_t num_channels,
+    const float ratio_d, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idt, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
+      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
+      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
+                    (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idt = align_flag
+                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * out_img_idt);
+    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
+    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
+    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
+    src_d = (src_d > 0) ? src_d : 0;
+    T d1lambda =
+        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
+    T d2lambda = 1.f - d1lambda;
+
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
+                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
+                        in_img_idx;
+      const T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
+      const T* in_pos2 = &in[in_pos2_idx];
+
+      // trilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          d2lambda *
+              (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
+               h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
+                           w1lambda * in_pos1[h_id * in_img_w + w_id])) +
+          d1lambda *
+              (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
+               h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
+                           w1lambda * in_pos2[h_id * in_img_w + w_id]));
+
+    } else {
+      int in_pos1_idx = out_id_h * input_w +
+                        in_img_idt * in_img_h * in_img_w * num_channels +
+                        in_img_idy * in_img_w * num_channels +
+                        in_img_idx * num_channels + channel_id;
+      const T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
+      const T* in_pos2 = &in[in_pos2_idx];
+
+      // trilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          d2lambda *
+              (h2lambda * (w2lambda * in_pos1[0] +
+                           w1lambda * in_pos1[w_id * num_channels]) +
+               h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] +
+                           w1lambda * in_pos1[h_id * in_img_w * num_channels +
+                                              w_id * num_channels])) +
+          d1lambda *
+              (h2lambda * (w2lambda * in_pos2[0] +
+                           w1lambda * in_pos2[w_id * num_channels]) +
+               h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] +
+                           w1lambda * in_pos2[h_id * in_img_w * num_channels +
+                                              w_id * num_channels]));
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTrilinearInterpBw(
+    T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, const T* out,
+    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
+    const size_t output_h, const size_t output_w, const size_t num_channels,
+    const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners,
+    const int align_mode, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idt, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
+      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
+      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
+                    (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idt = align_flag
+                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * out_img_idt);
+    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
+    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
+    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
+    src_d = (src_d > 0) ? src_d : 0;
+    T d1lambda =
+        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
+    T d2lambda = 1.f - d1lambda;
+
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
+                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
+                        in_img_idx;
+      T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
+      T* in_pos2 = &in[in_pos2_idx];
+
+      const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+      // trilinear interpolation grad
+      platform::CudaAtomicAdd(&in_pos1[0],
+                              d2lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[w_id],
+                              d2lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
+                              d2lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
+                              d2lambda * h1lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[0],
+                              d1lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[w_id],
+                              d1lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
+                              d1lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
+                              d1lambda * h1lambda * w1lambda * out_pos[0]);
+    } else {
+      int in_pos1_idx = out_id_h * input_w +
+                        in_img_idt * in_img_h * in_img_w * num_channels +
+                        in_img_idy * in_img_w * num_channels +
+                        in_img_idx * num_channels + channel_id;
+      T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
+      T* in_pos2 = &in[in_pos2_idx];
+
+      const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+      // trilinear interpolation grad
+      platform::CudaAtomicAdd(&in_pos1[0],
+                              d2lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
+                              d2lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
+                              d2lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(
+          &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
+          d2lambda * h1lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[0],
+                              d1lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
+                              d1lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
+                              d1lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(
+          &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
+          d1lambda * h1lambda * w1lambda * out_pos[0]);
+    }
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ static T Kecubic_interp(const T x0, const T x1,
+                                                   const T x2, const T x3,
+                                                   T t) {
+  T coeffs[4];
+  T a = -0.75;
+  T x_1 = t;
+  T x_2 = 1.0 - t;
+  coeffs[0] = cubic_convolution2<T>(x_1 + 1.0, a);
+  coeffs[1] = cubic_convolution1<T>(x_1, a);
+  coeffs[2] = cubic_convolution1<T>(x_2, a);
+  coeffs[3] = cubic_convolution2<T>(x_2 + 1.0, a);
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+template <typename T>
+__global__ void KeBicubicInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    T in_img_idy = align_corners
+                       ? static_cast<T>(ratio_h * out_img_idy)
+                       : static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+    int input_y = floorf(in_img_idy);
+    const T y_t = in_img_idy - input_y;
+
+    T in_img_idx = align_corners
+                       ? static_cast<T>(ratio_w * out_img_idx)
+                       : static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+    int input_x = floorf(in_img_idx);
+    const T x_t = in_img_idx - input_x;
+
+    T coefficients[4];
+    const T* in_pos_0;
+    const T* in_pos_1;
+    const T* in_pos_2;
+    const T* in_pos_3;
+    int access_x_0;
+    if (data_layout == DataLayout::kNCHW) {
+      for (int k = 0; k < 4; k++) {
+        int access_y =
+            max(min(input_y - 1 + k, static_cast<int>(in_img_h - 1)), 0);
+        access_x_0 = max(min(input_x - 1, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_1 =
+            max(min(input_x + 0, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_2 =
+            max(min(input_x + 1, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_3 =
+            max(min(input_x + 2, static_cast<int>(in_img_w - 1)), 0);
+
+        in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_0];
+        in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_1];
+        in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_2];
+        in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_3];
+
+        coefficients[k] = Kecubic_interp<T>(in_pos_0[0], in_pos_1[0],
+                                            in_pos_2[0], in_pos_3[0], x_t);
+      }
+
+      out[out_id_h * output_w + out_id_w] =
+          Kecubic_interp<T>(coefficients[0], coefficients[1], coefficients[2],
+                            coefficients[3], y_t);
+
+    } else {
+      for (int k = 0; k < 4; k++) {
+        int access_y =
+            max(min(input_y - 1 + k, static_cast<int>((in_img_h - 1))), 0);
+        int access_x_0 =
+            max(min(input_x - 1, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_1 =
+            max(min(input_x + 0, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_2 =
+            max(min(input_x + 1, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_3 =
+            max(min(input_x + 2, static_cast<int>((in_img_w - 1))), 0);
+
+        const T* in_pos_0 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_0 * num_channels + channel_id];
+        const T* in_pos_1 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_1 * num_channels + channel_id];
+        const T* in_pos_2 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_2 * num_channels + channel_id];
+        const T* in_pos_3 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_3 * num_channels + channel_id];
+
+        coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], in_pos_2[0],
+                                         in_pos_3[0], x_t);
+      }
+
+      out[out_id_h * output_w + out_id_w] =
+          static_cast<T>(Kecubic_interp(coefficients[0], coefficients[1],
+                                        coefficients[2], coefficients[3], y_t));
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeBicubicInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    T in_img_idy = align_corners
+                       ? static_cast<T>(ratio_h * out_img_idy)
+                       : static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+    int input_y = floorf(in_img_idy);
+    const T y_t = in_img_idy - input_y;
+
+    T in_img_idx = align_corners
+                       ? static_cast<T>(ratio_w * out_img_idx)
+                       : static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+    int input_x = floorf(in_img_idx);
+
+    const T x_t = in_img_idx - input_x;
+
+    T x_coeffs[4];
+    T y_coeffs[4];
+
+    get_cubic_upsample_coefficients(x_coeffs, x_t);
+    get_cubic_upsample_coefficients(y_coeffs, y_t);
+
+    const T* out_pos = &out[out_id_h * output_w + out_id_w];
+    T* in_pos;
+
+    for (int i = 0; i < 4; i++) {
+      for (int j = 0; j < 4; j++) {
+        int access_y = max(min(static_cast<int>(input_y - 1 + j),
+                               static_cast<int>(in_img_h - 1)),
+                           0);
+        int access_x = max(min(static_cast<int>(input_x - 1 + i),
+                               static_cast<int>(in_img_w - 1)),
+                           0);
+        if (data_layout == DataLayout::kNCHW) {
+          in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x];
+        } else {
+          in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                       access_x * num_channels + channel_id];
+        }
+        platform::CudaAtomicAdd(&in_pos[0],
+                                (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
+      }
+    }
+  }
+}
+
+template <typename T>
+static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* input_data = input.data<T>();
+
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_shape_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_shape_tensor);
+    out_w = new_size[0];
+  } else {
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      scale_w = scale_data[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    } else {
+      if (scale.size() > 0) {
+        scale_w = scale[0];
+        PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                                 "scale  of Op(interpolate) "
+                                                 "should be greater than 0."));
+      }
+    }
+    if (scale_w > 0.) {
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_w = size_data[0];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_w};
+  } else {
+    dim_out = {n, out_w, c};
+  }
+  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1.0) / (out_w - 1.0)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_cw = c * in_w;
+  int out_cw = c * out_w;
+  int pixelNum = n * out_cw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("linear" == interp_method) {
+    KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
+                          ctx.cuda_device_context().stream()>>>(
+        input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
+        align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* input_data = input.data<T>();
+
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_shape_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_shape_tensor);
+    out_h = new_size[0];
+    out_w = new_size[1];
+  } else {
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      if (scale_data.size() > 1) {
+        scale_h = scale_data[0];
+        scale_w = scale_data[1];
+      } else {
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      if (scale.size() > 1) {
+        scale_w = scale[1];
+        scale_h = scale[0];
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_w > 0. && scale_h > 0.) {
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_h, out_w};
+  } else {
+    dim_out = {n, out_h, out_w, c};
+  }
+  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_hw = in_h * in_w;
+  int out_hw = out_h * out_w;
+  int in_chw = c * in_hw;
+  int out_chw = c * out_hw;
+
+  int pixelNum = n * out_chw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("nearest" == interp_method) {
+    KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
+                                   ctx.cuda_device_context().stream()>>>(
+        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  } else if ("bilinear" == interp_method) {
+    KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
+                            ctx.cuda_device_context().stream()>>>(
+        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+        out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
+  } else if ("bicubic" == interp_method) {
+    KeBicubicInterpFw<
+        T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
+        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* input_data = input.data<T>();
+
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_shape_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_shape_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  } else {
+    float scale_d = -1;
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      if (scale_data.size() > 1) {
+        scale_d = scale_data[0];
+        scale_h = scale_data[1];
+        scale_w = scale_data[2];
+      } else {
+        scale_d = scale_data[0];
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      if (scale.size() > 1) {
+        scale_d = scale[0];
+        scale_h = scale[1];
+        scale_w = scale[2];
+
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+      out_d = static_cast<int>(in_d * scale_d);
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_d = size_data[0];
+      out_h = size_data[1];
+      out_w = size_data[2];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
+                                  "out_d in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_d, out_h, out_w};
+  } else {
+    dim_out = {n, out_d, out_h, out_w, c};
+  }
+  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_dhw = in_d * in_h * in_w;
+  int out_dhw = out_d * out_h * out_w;
+  int in_cdhw = c * in_dhw;
+  int out_cdhw = c * out_dhw;
+
+  int pixelNum = n * out_cdhw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("trilinear" == interp_method) {
+    KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
+        input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
+        out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
+        align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad, const Tensor output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    scale_w = scale_data[0];
+    PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                             "scale  of Op(interpolate) "
+                                             "should be greater than 0."));
+  } else {
+    if (scale.size() > 0) {
+      scale_w = scale[0];
+
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    }
+  }
+  if (scale_w > 0.) {
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_w = size_data[0];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_w = new_size[0];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_w};
+  } else {
+    dim_grad = {n, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+  int in_cw = c * in_w;
+  int out_cw = c * out_w;
+  int pixelNum = n * out_cw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("linear" == interp_method) {
+    KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
+                          ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
+        ratio_w, align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad, const Tensor output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    if (scale_data.size() > 1) {
+      scale_h = scale_data[0];
+      scale_w = scale_data[1];
+    } else {
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    if (scale.size() > 1) {
+      scale_w = scale[1];
+      scale_h = scale[0];
+
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_w > 0. && scale_h > 0.) {
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_h = size_data[0];
+    out_w = size_data[1];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_h = new_size[0];
+    out_w = new_size[1];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_h, in_w};
+  } else {
+    dim_grad = {n, in_h, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_hw = in_h * in_w;
+  int out_hw = out_h * out_w;
+  int in_chw = c * in_hw;
+  int out_chw = c * out_hw;
+
+  int pixelNum = n * out_chw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("nearest" == interp_method) {
+    KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
+                                   ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  } else if ("bilinear" == interp_method) {
+    KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                            ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
+        data_layout);
+  } else if ("bicubic" == interp_method) {
+    KeBicubicInterpBw<
+        T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad,
+                                 const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_d = -1;
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    if (scale_data.size() > 1) {
+      scale_d = scale_data[0];
+      scale_h = scale_data[1];
+      scale_w = scale_data[2];
+    } else {
+      scale_d = scale_data[0];
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    if (scale.size() > 1) {
+      scale_d = scale[0];
+      scale_h = scale[1];
+      scale_w = scale[2];
+
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+    out_d = static_cast<int>(in_d * scale_d);
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_d = size_data[0];
+    out_h = size_data[1];
+    out_w = size_data[2];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_d, in_h, in_w};
+  } else {
+    dim_grad = {n, in_d, in_h, in_w, c};
+  }
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_dhw = in_d * in_h * in_w;
+  int out_dhw = out_d * out_h * out_w;
+  int in_cdhw = c * in_dhw;
+  int out_cdhw = c * out_dhw;
+
+  int pixelNum = n * out_cdhw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("trilinear" == interp_method) {
+    KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
+        out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
+        align_mode, data_layout);
+  }
+}
+
+template <typename T>
+class InterpolateOpV2CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::NotFound("This kernel only runs on GPU device."));
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+
+    auto input_dims = input->dims();
+    if (input_dims.size() == 3) {  // 1D interpolation
+      Interpolate1DCUDAFwd<T>(ctx, *input, output);
+    } else if (input_dims.size() == 4) {  // 2D interpolation
+      Interpolate2DCUDAFwd<T>(ctx, *input, output);
+    } else if (input_dims.size() == 5) {  // 3D interpolation
+      Interpolate3DCUDAFwd<T>(ctx, *input, output);
+    }
+  }
+};
+
+template <typename T>
+class InterpolateV2GradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::NotFound("This kernel only runs on GPU device."));
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto output_grad_dims = output_grad->dims();
+    if (output_grad_dims.size() == 3) {  // 1D interpolation
+      Interpolate1DCUDABwd<T>(ctx, input_grad, *output_grad);
+    } else if (output_grad_dims.size() == 4) {  // 2D interpolation
+      Interpolate2DCUDABwd<T>(ctx, input_grad, *output_grad);
+    } else if (output_grad_dims.size() == 5) {  // 3D interpolation
+      Interpolate3DCUDABwd<T>(ctx, input_grad, *output_grad);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2,
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2_grad,
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_v2,
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_v2_grad,
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2,
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2_grad,
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(linear_interp_v2, ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(linear_interp_v2_grad,
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2,
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2_grad,
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..111766934b8300c0a7b46ae9a065b8c42460e577
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.h
@@ -0,0 +1,1386 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+inline std::vector<int> get_new_shape(
+    const std::vector<const Tensor*>& list_new_shape_tensor) {
+  // get tensor from
+  std::vector<int> vec_new_shape;
+  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
+    auto tensor = list_new_shape_tensor[i];
+    PADDLE_ENFORCE_EQ(
+        tensor->dims(), framework::make_ddim({1}),
+        platform::errors::InvalidArgument("shape of dim tensor should be [1]"));
+    if (platform::is_gpu_place(tensor->place())) {
+      framework::Tensor temp;
+      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+      vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
+    } else {
+      vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
+    }
+  }
+
+  return vec_new_shape;
+}
+
+template <typename T>
+inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
+  std::vector<T> vec_new_data;
+  auto* new_data = new_data_tensor->data<T>();
+  framework::Tensor cpu_starts_tensor;
+  if (platform::is_gpu_place(new_data_tensor->place())) {
+    TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor);
+    new_data = cpu_starts_tensor.data<T>();
+  }
+  vec_new_data = std::vector<T>(new_data, new_data + new_data_tensor->numel());
+  return vec_new_data;
+}
+
+inline void ExtractNCDWH(const framework::DDim& dims,
+                         const DataLayout& data_layout, int* N, int* C, int* D,
+                         int* H, int* W) {
+  *N = dims[0];
+
+  if (dims.size() == 3) {
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[2];
+    *D = 1;
+    *H = 1;
+    *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
+  } else if (dims.size() == 4) {
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3];
+    *D = 1;
+    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
+    *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
+  } else {
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4];
+    *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
+    *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
+    *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3];
+  }
+}
+
+template <typename T>
+static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
+                                       const float ratio_h, const float ratio_w,
+                                       const int n, const int c,
+                                       const int out_h, const int out_w,
+                                       const bool align_corners,
+                                       const DataLayout& data_layout) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+  for (int k = 0; k < out_h; k++) {  // loop for images
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
+
+    for (int l = 0; l < out_w; l++) {
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
+
+      for (int i = 0; i < n; i++) {    // loop for batches
+        for (int j = 0; j < c; j++) {  // loop for channels
+          if (data_layout == DataLayout::kNCHW) {
+            output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
+          } else {
+            output_t(i, k, l, j) = input_t(i, in_k, in_l, j);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void LinearInterpolation(const Tensor& input, Tensor* output,
+                                const float ratio_w, const int in_w,
+                                const int n, const int c, const int out_w,
+                                const bool align_corners, const bool align_mode,
+                                const DataLayout data_layout) {
+  auto input_t = EigenTensor<T, 3>::From(input);
+  auto output_t = EigenTensor<T, 3>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  std::vector<int> vx_w, vx_e;
+  std::vector<float> vd_w, vd_e;
+  vx_w.reserve(out_w);
+  vx_e.reserve(out_w);
+  vd_w.reserve(out_w);
+  vd_e.reserve(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;                       // w
+    int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w;  // w_id
+
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;  // w1lambda
+    float d_e = 1.f - d_w;                                         // w2lambda
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+  for (int i = 0; i < n; i++) {    // loop for batches
+    for (int j = 0; j < c; j++) {  // loop for channels
+      for (int l = 0; l < out_w; l++) {
+        // linear interpolation
+        T out_t;
+        if (data_layout == DataLayout::kNCHW) {
+          out_t = input_t(i, j, vx_w[l]) * vd_e[l] +
+                  input_t(i, j, vx_e[l]) * vd_w[l];
+          output_t(i, j, l) = out_t;
+        } else {
+          out_t = input_t(i, vx_w[l], j) * vd_e[l] +
+                  input_t(i, vx_e[l], j) * vd_w[l];
+          output_t(i, l, j) = out_t;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void LinearInterpolationGrad(const Tensor& output_grad,
+                                    Tensor* input_grad, const float ratio_w,
+                                    const int in_w, const int n, const int c,
+                                    const int out_w, const bool align_corners,
+                                    const int align_mode,
+                                    const DataLayout data_layout) {
+  auto input_grad_t = EigenTensor<T, 3>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 3>::From(output_grad);
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (int l = 0; l < out_w; l++) {
+    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;                       // w
+    int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w;  // w_id
+
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;  // w1lambda
+    float d_e = 1.f - d_w;                                         // w2lambda
+
+    for (int i = 0; i < n; i++) {    // loop for batches
+      for (int j = 0; j < c; j++) {  // loop for channels
+        // linear interpolation grad
+        if (data_layout == DataLayout::kNCHW) {
+          const T grad = output_grad_t(i, j, l);
+          input_grad_t(i, j, x_w) += static_cast<T>(grad * d_e);
+          input_grad_t(i, j, x_e) += static_cast<T>(grad * d_w);
+        } else {
+          const T grad = output_grad_t(i, l, j);
+          input_grad_t(i, x_w, j) += static_cast<T>(grad * d_e);
+          input_grad_t(i, x_e, j) += static_cast<T>(grad * d_w);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void BilinearInterpolation(const Tensor& input, Tensor* output,
+                                  const float ratio_h, const float ratio_w,
+                                  const int in_h, const int in_w, const int n,
+                                  const int c, const int out_h, const int out_w,
+                                  const bool align_corners,
+                                  const bool align_mode,
+                                  const DataLayout data_layout) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  std::vector<int> vy_n, vy_s;
+  std::vector<float> vd_n, vd_s;
+  vy_n.reserve(out_h);
+  vy_s.reserve(out_h);
+  vd_n.reserve(out_h);
+  vd_s.reserve(out_h);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int k = 0; k < out_h; k++) {
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    {
+      vy_n[k] = y_n;
+      vy_s[k] = y_s;
+      vd_n[k] = d_n;
+      vd_s[k] = d_s;
+    }
+  }
+
+  std::vector<int> vx_w, vx_e;
+  std::vector<float> vd_w, vd_e;
+  vx_w.reserve(out_w);
+  vx_e.reserve(out_w);
+  vd_w.reserve(out_w);
+  vd_e.reserve(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = (align_mode == 0 && !align_corners)
+                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                  : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;
+    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_e = 1.f - d_w;
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(4)
+#endif
+  for (int i = 0; i < n; i++) {          // loop for batches
+    for (int j = 0; j < c; j++) {        // loop for channels
+      for (int k = 0; k < out_h; k++) {  // loop for images
+        for (int l = 0; l < out_w; l++) {
+          // bilinear interpolation
+          T out_t;
+          if (data_layout == DataLayout::kNCHW) {
+            out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
+                    input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
+                    input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
+                    input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
+            output_t(i, j, k, l) = out_t;
+
+          } else {
+            out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] +
+                    input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] +
+                    input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] +
+                    input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l];
+            output_t(i, k, l, j) = out_t;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void TrilinearInterpolation(
+    const Tensor& input, Tensor* output, const float ratio_d,
+    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
+    const int in_w, const int n, const int c, const int out_d, const int out_h,
+    const int out_w, const bool align_corners, const bool align_mode,
+    const DataLayout& data_layout) {
+  auto input_t = EigenTensor<T, 5>::From(input);
+  auto output_t = EigenTensor<T, 5>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  std::vector<int> vt_f, vt_b;
+  std::vector<float> vd_f, vd_b;
+  vt_f.reserve(out_d);
+  vt_b.reserve(out_d);
+  vd_f.reserve(out_d);
+  vd_b.reserve(out_d);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int j = 0; j < out_d; j++) {
+    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * j);
+    t_f = (t_f > 0) ? t_f : 0;
+    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
+    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
+    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
+    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
+    float d_b = 1.f - d_f;
+    {
+      vt_f[j] = t_f;
+      vt_b[j] = t_b;
+      vd_f[j] = d_f;
+      vd_b[j] = d_b;
+    }
+  }
+
+  std::vector<int> vy_n, vy_s;
+  std::vector<float> vd_n, vd_s;
+  vy_n.reserve(out_h);
+  vy_s.reserve(out_h);
+  vd_n.reserve(out_h);
+  vd_s.reserve(out_h);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int k = 0; k < out_h; k++) {
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    {
+      vy_n[k] = y_n;
+      vy_s[k] = y_s;
+      vd_n[k] = d_n;
+      vd_s[k] = d_s;
+    }
+  }
+
+  std::vector<int> vx_w, vx_e;
+  std::vector<float> vd_w, vd_e;
+  vx_w.reserve(out_w);
+  vx_e.reserve(out_w);
+  vd_w.reserve(out_w);
+  vd_e.reserve(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = (align_mode == 0 && !align_corners)
+                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                  : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;
+    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_e = 1.f - d_w;
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(5)
+#endif
+  for (int b = 0; b < n; b++) {          // loop for batches
+    for (int i = 0; i < c; i++) {        // loop for channels
+      for (int j = 0; j < out_d; j++) {  // loop for D, H, W
+        for (int k = 0; k < out_h; k++) {
+          for (int l = 0; l < out_w; l++) {
+            // trilinear interpolation
+            if (data_layout == DataLayout::kNCHW) {
+              T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
+                            vd_n[k] * vd_w[l] +
+                        input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
+                            vd_n[k] * vd_w[l];
+              output_t(b, i, j, k, l) = out_t;
+            } else {
+              T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] *
+                            vd_n[k] * vd_w[l] +
+                        input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] *
+                            vd_n[k] * vd_w[l];
+              output_t(b, j, k, l, i) = out_t;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+HOSTDEVICE inline T cubic_convolution1(T x, T A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+template <typename T>
+HOSTDEVICE inline T cubic_convolution2(T x, T A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+template <typename T>
+HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) {
+  T A = -0.75;
+
+  T x1 = t;
+  coeffs[0] = cubic_convolution2<T>(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1<T>(x1, A);
+
+  // opposite coefficients
+  T x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1<T>(x2, A);
+  coeffs[3] = cubic_convolution2<T>(x2 + 1.0, A);
+}
+
+template <typename T>
+static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) {
+  T coeffs[4];
+  get_cubic_upsample_coefficients<T>(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+template <typename T>
+static void BicubicInterpolation(const Tensor& input, Tensor* output,
+                                 const float ratio_h, const float ratio_w,
+                                 const int in_h, const int in_w, const int n,
+                                 const int c, const int out_h, const int out_w,
+                                 const bool align_corners,
+                                 const DataLayout data_layout) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+
+  for (int k = 0; k < out_h; k++) {  // loop for images
+    T y_n = align_corners ? static_cast<T>(ratio_h * k)
+                          : static_cast<T>(ratio_h * (k + 0.5) - 0.5);
+    int input_y = floorf(y_n);
+    const T y_t = y_n - input_y;
+
+    for (int l = 0; l < out_w; l++) {
+      T x_n = align_corners ? static_cast<T>(ratio_w * l)
+                            : static_cast<T>(ratio_w * (l + 0.5) - 0.5);
+      int input_x = floorf(x_n);
+      const T x_t = x_n - input_x;
+
+      for (int i = 0; i < n; i++) {    // loop for batches
+        for (int j = 0; j < c; j++) {  // loop for channels
+          T coefficients[4];
+          // interp 4 times in x direction
+          for (int ii = 0; ii < 4; ii++) {
+            int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1),
+                                    static_cast<int>(0));
+            int access_x_0 =
+                std::max(std::min(input_x - 1, in_w - 1), static_cast<int>(0));
+            int access_x_1 =
+                std::max(std::min(input_x + 0, in_w - 1), static_cast<int>(0));
+            int access_x_2 =
+                std::max(std::min(input_x + 1, in_w - 1), static_cast<int>(0));
+            int access_x_3 =
+                std::max(std::min(input_x + 2, in_w - 1), static_cast<int>(0));
+            if (data_layout == DataLayout::kNCHW) {
+              coefficients[ii] =
+                  cubic_interp<T>(input_t(i, j, access_y, access_x_0),
+                                  input_t(i, j, access_y, access_x_1),
+                                  input_t(i, j, access_y, access_x_2),
+                                  input_t(i, j, access_y, access_x_3), x_t);
+            } else {
+              coefficients[ii] =
+                  cubic_interp<T>(input_t(i, access_y, access_x_0, j),
+                                  input_t(i, access_y, access_x_1, j),
+                                  input_t(i, access_y, access_x_2, j),
+                                  input_t(i, access_y, access_x_3, j), x_t);
+            }
+          }
+
+          // interp y direction
+          if (data_layout == DataLayout::kNCHW) {
+            output_t(i, j, k, l) =
+                cubic_interp<T>(coefficients[0], coefficients[1],
+                                coefficients[2], coefficients[3], y_t);
+          } else {
+            output_t(i, k, l, j) =
+                cubic_interp<T>(coefficients[0], coefficients[1],
+                                coefficients[2], coefficients[3], y_t);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void NearestNeighborInterpolateGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int n, const int c, const int out_h,
+    const int out_w, const bool align_corners, const DataLayout data_layout) {
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  for (int k = 0; k < out_h; k++) {  // loop for images
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
+
+    for (int l = 0; l < out_w; l++) {
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
+
+      for (int i = 0; i < n; i++) {    // loop for batches
+        for (int j = 0; j < c; j++) {  // loop for channels
+          if (data_layout == DataLayout::kNCHW) {
+            input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
+          } else {
+            input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void BilinearInterpolationGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int in_h, const int in_w, const int n,
+    const int c, const int out_h, const int out_w, const bool align_corners,
+    const int align_mode, const DataLayout data_layout) {
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (int k = 0; k < out_h; k++) {  // loop for images
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+
+    for (int l = 0; l < out_w; l++) {
+      int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                           : static_cast<int>(ratio_w * l);
+      x_w = (x_w > 0) ? x_w : 0;
+      int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+      float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+      idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+      float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+      float d_e = 1.f - d_w;
+
+      for (int i = 0; i < n; i++) {    // loop for batches
+        for (int j = 0; j < c; j++) {  // loop for channels
+          // bilinear interpolation grad
+          if (data_layout == DataLayout::kNCHW) {
+            const T grad = output_grad_t(i, j, k, l);
+            input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
+            input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
+            input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
+            input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
+          } else {
+            const T grad = output_grad_t(i, k, l, j);
+            input_grad_t(i, y_n, x_w, j) += static_cast<T>(grad * d_s * d_e);
+            input_grad_t(i, y_s, x_w, j) += static_cast<T>(grad * d_n * d_e);
+            input_grad_t(i, y_n, x_e, j) += static_cast<T>(grad * d_s * d_w);
+            input_grad_t(i, y_s, x_e, j) += static_cast<T>(grad * d_n * d_w);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void TrilinearInterpolationGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_d,
+    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
+    const int in_w, const int n, const int c, const int out_d, const int out_h,
+    const int out_w, const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  auto input_grad_t = EigenTensor<T, 5>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (int j = 0; j < out_d; j++) {  // loop for D
+    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * j);
+    t_f = (t_f > 0) ? t_f : 0;
+    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
+    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
+    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
+    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
+    float d_b = 1.f - d_f;
+
+    for (int k = 0; k < out_h; k++) {  // loop for H
+      int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                           : static_cast<int>(ratio_h * k);
+      y_n = (y_n > 0) ? y_n : 0;
+      int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+      float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+      idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+      float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+      float d_s = 1.f - d_n;
+
+      for (int l = 0; l < out_w; l++) {  // loop for W
+        int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                             : static_cast<int>(ratio_w * l);
+        x_w = (x_w > 0) ? x_w : 0;
+        int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+        float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+        idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+        float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+        float d_e = 1.f - d_w;
+
+        for (int b = 0; b < n; b++) {    // loop for batches
+          for (int i = 0; i < c; i++) {  // loop for channels
+            // trilinear interpolation grad
+            if (data_layout == DataLayout::kNCHW) {
+              const T grad = output_grad_t(b, i, j, k, l);
+              input_grad_t(b, i, t_f, y_n, x_w) +=
+                  static_cast<T>(grad * d_b * d_s * d_e);
+              input_grad_t(b, i, t_f, y_n, x_e) +=
+                  static_cast<T>(grad * d_b * d_s * d_w);
+              input_grad_t(b, i, t_f, y_s, x_w) +=
+                  static_cast<T>(grad * d_b * d_n * d_e);
+              input_grad_t(b, i, t_f, y_s, x_e) +=
+                  static_cast<T>(grad * d_b * d_n * d_w);
+              input_grad_t(b, i, t_b, y_n, x_w) +=
+                  static_cast<T>(grad * d_f * d_s * d_e);
+              input_grad_t(b, i, t_b, y_n, x_e) +=
+                  static_cast<T>(grad * d_f * d_s * d_w);
+              input_grad_t(b, i, t_b, y_s, x_w) +=
+                  static_cast<T>(grad * d_f * d_n * d_e);
+              input_grad_t(b, i, t_b, y_s, x_e) +=
+                  static_cast<T>(grad * d_f * d_n * d_w);
+            } else {
+              const T grad = output_grad_t(b, j, k, l, i);
+              input_grad_t(b, t_f, y_n, x_w, i) +=
+                  static_cast<T>(grad * d_b * d_s * d_e);
+              input_grad_t(b, t_f, y_n, x_e, i) +=
+                  static_cast<T>(grad * d_b * d_s * d_w);
+              input_grad_t(b, t_f, y_s, x_w, i) +=
+                  static_cast<T>(grad * d_b * d_n * d_e);
+              input_grad_t(b, t_f, y_s, x_e, i) +=
+                  static_cast<T>(grad * d_b * d_n * d_w);
+              input_grad_t(b, t_b, y_n, x_w, i) +=
+                  static_cast<T>(grad * d_f * d_s * d_e);
+              input_grad_t(b, t_b, y_n, x_e, i) +=
+                  static_cast<T>(grad * d_f * d_s * d_w);
+              input_grad_t(b, t_b, y_s, x_w, i) +=
+                  static_cast<T>(grad * d_f * d_n * d_e);
+              input_grad_t(b, t_b, y_s, x_e, i) +=
+                  static_cast<T>(grad * d_f * d_n * d_w);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void BicubicInterpolationGrad(const Tensor& output_grad,
+                                     Tensor* input_grad, const float ratio_h,
+                                     const float ratio_w, const int in_h,
+                                     const int in_w, const int n, const int c,
+                                     const int out_h, const int out_w,
+                                     const bool align_corners,
+                                     const DataLayout data_layout) {
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  for (int k = 0; k < out_h; k++) {  // loop for images
+    T y_n = align_corners ? static_cast<T>(ratio_h * k)
+                          : static_cast<T>(ratio_h * (k + 0.5) - 0.5);
+    int input_y = floorf(y_n);
+    T y_t = y_n - input_y;
+
+    for (int l = 0; l < out_w; l++) {
+      T x_n = align_corners ? static_cast<T>(ratio_w * l)
+                            : static_cast<T>(ratio_w * (l + 0.5) - 0.5);
+      int input_x = floorf(x_n);
+      T x_t = x_n - input_x;
+
+      T x_coeffs[4];
+      T y_coeffs[4];
+
+      get_cubic_upsample_coefficients<T>(x_coeffs, x_t);
+      get_cubic_upsample_coefficients<T>(y_coeffs, y_t);
+
+      for (int i = 0; i < n; i++) {    // loop for batches
+        for (int j = 0; j < c; j++) {  // loop for channels
+          // bicubic interpolation grad
+          for (int ii = 0; ii < 4; ii++) {
+            for (int jj = 0; jj < 4; jj++) {
+              int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1),
+                                      static_cast<int>(0));
+              int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1),
+                                      static_cast<int>(0));
+              if (data_layout == DataLayout::kNCHW) {
+                T grad = output_grad_t(i, j, k, l);
+                input_grad_t(i, j, access_y, access_x) +=
+                    grad * y_coeffs[jj] * x_coeffs[ii];
+              } else {
+                T grad = output_grad_t(i, k, l, j);
+                input_grad_t(i, access_y, access_x, j) +=
+                    grad * y_coeffs[jj] * x_coeffs[ii];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_w = new_size[0];
+  } else {
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      scale_w = scale_data[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    } else {
+      if (scale.size() > 0) {
+        scale_w = scale[0];
+
+        PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                                 "scale  of Op(interpolate) "
+                                                 "should be greater than 0."));
+      }
+    }
+    if (scale_w > 0.) {
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_w = out_size_data[0];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_w};
+  } else {
+    dim_out = {n, out_w, c};
+  }
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+  if ("linear" == interp_method) {
+    LinearInterpolation<T>(input, output, ratio_w, in_w, n, c, out_w,
+                           align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_h = new_size[0];
+    out_w = new_size[1];
+  } else {
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      if (scale_data.size() > 1) {
+        scale_h = scale_data[0];
+        scale_w = scale_data[1];
+      } else {
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      if (scale.size() > 1) {
+        scale_h = scale[0];
+        scale_w = scale[1];
+
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_h > 0. && scale_w > 0.) {
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_h, out_w};
+  } else {
+    dim_out = {n, out_h, out_w, c};
+  }
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("bilinear" == interp_method) {
+    BilinearInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
+                             out_h, out_w, align_corners, align_mode,
+                             data_layout);
+  } else if ("nearest" == interp_method) {
+    NearestNeighborInterpolate<T>(input, output, ratio_h, ratio_w, n, c, out_h,
+                                  out_w, align_corners, data_layout);
+  } else if ("bicubic" == interp_method) {
+    BicubicInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
+                            out_h, out_w, align_corners, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  } else {
+    float scale_d = -1;
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      if (scale_data.size() > 1) {
+        scale_d = scale_data[0];
+        scale_h = scale_data[1];
+        scale_w = scale_data[2];
+      } else {
+        scale_d = scale_data[0];
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      if (scale.size() > 1) {
+        scale_d = scale[0];
+        scale_h = scale[1];
+        scale_w = scale[2];
+
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0 && scale_d, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_w > 0. && scale_h > 0. && scale_d > 0.) {
+      out_d = static_cast<int>(in_d * scale_d);
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_d = out_size_data[0];
+      out_h = out_size_data[1];
+      out_w = out_size_data[2];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
+                                  "out_d in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_d, out_h, out_w};
+  } else {
+    dim_out = {n, out_d, out_h, out_w, c};
+  }
+
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("trilinear" == interp_method) {
+    TrilinearInterpolation<T>(input, output, ratio_d, ratio_h, ratio_w, in_d,
+                              in_h, in_w, n, c, out_d, out_h, out_w,
+                              align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_w = -1.0;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    scale_w = scale_data[0];
+    PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                             "scale  of Op(interpolate) "
+                                             "should be greater than 0."));
+  } else {
+    if (scale.size() > 0) {
+      scale_w = scale[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    }
+  }
+  if (scale_w > 0.) {
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_w = out_size_data[0];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_w = new_size[0];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_w};
+  } else {
+    dim_grad = {n, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+  if ("linear" == interp_method) {
+    LinearInterpolationGrad<T>(output_grad, input_grad, ratio_w, in_w, n, c,
+                               out_w, align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    if (scale_data.size() > 1) {
+      scale_h = scale_data[0];
+      scale_w = scale_data[1];
+    } else {
+      scale_w = scale_data[0];
+      scale_h = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    if (scale.size() > 1) {
+      scale_h = scale[0];
+      scale_w = scale[1];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_h > 0. && scale_w > 0.) {
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_h = out_size_data[0];
+    out_w = out_size_data[1];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_h = new_size[0];
+    out_w = new_size[1];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_h, in_w};
+  } else {
+    dim_grad = {n, in_h, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("bilinear" == interp_method) {
+    BilinearInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
+                                 in_h, in_w, n, c, out_h, out_w, align_corners,
+                                 align_mode, data_layout);
+  } else if ("nearest" == interp_method) {
+    NearestNeighborInterpolateGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
+                                      n, c, out_h, out_w, align_corners,
+                                      data_layout);
+  } else if ("bicubic" == interp_method) {
+    BicubicInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w, in_h,
+                                in_w, n, c, out_h, out_w, align_corners,
+                                data_layout);
+  }
+}
+
+template <typename T>
+static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_d = -1;
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    if (scale_data.size() > 1) {
+      scale_d = scale_data[0];
+      scale_h = scale_data[1];
+      scale_w = scale_data[2];
+    } else {
+      scale_d = scale_data[0];
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    if (scale.size() > 1) {
+      scale_d = scale[0];
+      scale_h = scale[1];
+      scale_w = scale[2];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+    out_d = static_cast<int>(in_d * scale_d);
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_d = out_size_data[0];
+    out_h = out_size_data[1];
+    out_w = out_size_data[2];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_d, in_h, in_w};
+  } else {
+    dim_grad = {n, in_d, in_h, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("trilinear" == interp_method) {
+    TrilinearInterpolationGrad<T>(
+        output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n,
+        c, out_d, out_h, out_w, align_corners, align_mode, data_layout);
+  }
+}
+
+template <typename T>
+class InterpolateV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+
+    auto input_dims = input->dims();
+    if (input_dims.size() == 3) {  // 1D interpolation
+      Interpolate1DCPUFwd<T>(ctx, *input, output);
+    } else if (input_dims.size() == 4) {  // 2D interpolation
+      Interpolate2DCPUFwd<T>(ctx, *input, output);
+    } else if (input_dims.size() == 5) {  // 3D interpolation
+      Interpolate3DCPUFwd<T>(ctx, *input, output);
+    }
+  }
+};
+
+template <typename T>
+class InterpolateV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto output_grad_dims = output_grad->dims();
+    if (output_grad_dims.size() == 3) {  // 1D interpolation grad
+      Interpolate1DCPUBwd<T>(ctx, input_grad, *output_grad);
+    } else if (output_grad_dims.size() == 4) {  // 2D interpolation grad
+      Interpolate2DCPUBwd<T>(ctx, input_grad, *output_grad);
+    } else if (output_grad_dims.size() == 5) {  // 3D interpolation grad
+      Interpolate3DCPUBwd<T>(ctx, input_grad, *output_grad);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72da43e3bc63c1c585fe19d703892c23ce7b0ec2
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/isfinite_v2_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace plat = paddle::platform;
+
+namespace paddle {
+namespace operators {
+
+class OverflowV2Op : public framework::OperatorWithKernel {
+ public:
+  OverflowV2Op(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2");
+    UnaryOpUnchangedInferShape(ctx);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    int dtype = -1;
+    auto *x_var = ctx.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      dtype = x_var->Get<framework::LoDTensor>().type();
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      dtype = x_var->Get<framework::SelectedRows>().value().type();
+    } else {
+      PADDLE_THROW(plat::errors::InvalidArgument(
+          "Cannot find the input data type by all input data"));
+    }
+    return framework::OpKernelType(framework::proto::VarType::Type(dtype),
+                                   ctx.GetPlace());
+  }
+};
+
+class OverflowV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input tensors of overflowv2 operator.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of overflowv2 operator. "
+              "Same size compare to input tensor");
+    AddComment(string::Sprintf(R"DOC(
+Overflow %s operator.
+
+$$Out = %s(X)$$
+
+Check whether each element of X is Inf or Nan, return the bool result of each
+element of X as a tensor.
+
+%s
+)DOC",
+                               GetName(), GetComments()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetComments() const = 0;
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_V2OP_MAKER(op_type, comment)                         \
+  namespace paddle {                                                  \
+  namespace operators {                                               \
+  class _##op_type##OverflowV2OpMaker                                 \
+      : public ::paddle::operators::OverflowV2OpMaker {               \
+   protected:                                                         \
+    std::string GetName() const { return #op_type; }                  \
+    std::string GetComments() const { return comment; }               \
+  };                                                                  \
+  }                                                                   \
+  }                                                                   \
+  REGISTER_OPERATOR(                                                  \
+      op_type, ops::OverflowV2Op, ops::_##op_type##OverflowV2OpMaker, \
+      paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, \
+      paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
+
+#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                       \
+  REGISTER_OP_CPU_KERNEL(                                                    \
+      op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int,  \
+                                   ops::functor>,                            \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t,       \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,         \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,        \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, plat::float16, \
+                          ops::functor>);
+
+REGISTER_V2OP_MAKER(isinf_v2, "isinfv2(X)");
+REGISTER_V2OP_MAKER(isnan_v2, "isnanv2(X)");
+REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)");
+
+REGISTER_OVERFLOW_CPU_KERNEL(isinf_v2, InfinityV2Functor);
+REGISTER_OVERFLOW_CPU_KERNEL(isnan_v2, NANV2Functor);
+REGISTER_OVERFLOW_CPU_KERNEL(isfinite_v2, IsfiniteV2Functor);
diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a6d818d0501e60dfffc8995075bb7f0369788fd
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/isfinite_v2_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor)                       \
+  REGISTER_OP_CUDA_KERNEL(                                                    \
+      op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,  \
+                                   ops::functor>,                             \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, int64_t,       \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,         \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,        \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
+                          ops::functor>);
+
+REGISTER_OVERFLOW_CUDA_KERNEL(isinf_v2, InfinityV2Functor);
+REGISTER_OVERFLOW_CUDA_KERNEL(isnan_v2, NANV2Functor);
+REGISTER_OVERFLOW_CUDA_KERNEL(isfinite_v2, IsfiniteV2Functor);
diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f0aa63ce80248ee9f7839890f611b9d5293789e
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/isfinite_op.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+struct InfinityV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsInfV2(tensor, out);
+  }
+};
+
+struct NANV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsNANV2(tensor, out);
+  }
+};
+
+struct IsfiniteV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorIsfiniteV2(tensor, out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index 0a7146be83dcb673573f1fdcb94ed2d2c57bd2c3..2c3172d2a1112e2c79a3c1215ccd0d3f08d59451 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -53,11 +53,9 @@ class LinspaceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "Start"),
-        ctx.device_context(), layout_, library_);
+        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
+        ctx.GetPlace());
   }
 };
 
@@ -73,6 +71,7 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Num",
              "Number of entry in the sequence. It is a tensor of shape [1], "
              "should be of type int32.");
+    AddAttr<int>("dtype", "The output data type.");
     AddOutput("Out", "A sequence of numbers.");
     AddComment(R"DOC(
     Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
@@ -85,4 +84,6 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
 REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
+                       ops::CPULinspaceKernel<int32_t>,
+                       ops::CPULinspaceKernel<int64_t>,
                        ops::CPULinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
index 47d4536dcfe2a0ab43b3584196a138214e438e3e..8aca892a81d41b1e0a9f7f9c14169c2817ae9452 100644
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -20,13 +20,15 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
-  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) {
+  CUDA_KERNEL_LOOP(index, size) {
+    out[index] = static_cast<T>(start + step * index);
+  }
 }
 
 template <typename T>
 __global__ void LinspaceSpecialKernel(T start, T* out) {
-  out[0] = start;
+  out[0] = static_cast<T>(start);
 }
 
 template <typename T>
@@ -51,9 +53,9 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
     out->Resize(framework::make_ddim({num}));
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
-    T step = 0;
+    double step = 0;
     if (num != 1) {
-      step = (stop - start) / (num - 1);
+      step = (static_cast<double>(stop - start)) / (num - 1);
     }
 
     auto stream = context.cuda_device_context().stream();
@@ -68,4 +70,6 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
+                        ops::CUDALinspaceKernel<int32_t>,
+                        ops::CUDALinspaceKernel<int64_t>,
                         ops::CUDALinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
index b1fcac73b0ad249aa19859bde770a8554cdb7408..9fb4960375ed7be60598d558c65310bd4a4b84bc 100644
--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
@@ -35,14 +35,12 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     if (num > 1) {
-      T step = (stop - start) / (num - 1);
-      T value = start;
+      double step = (static_cast<double>(stop - start)) / (num - 1);
       for (int i = 0; i < num; ++i) {
-        out_data[i] = value;
-        value += step;
+        out_data[i] = static_cast<T>(start + step * i);
       }
     } else {
-      out_data[0] = start;
+      out_data[0] = static_cast<T>(start);
     }
   }
 };
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6e2b3ecff8c83e47a9016cc3d233d1aa03fb52b
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/log_softmax_op.h"
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+namespace paddle {
+namespace operators {
+
+class LogSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    return UnaryOpUnchangedInferShapeCheckAxis(ctx);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of softmax, "
+             "whose dimension :attr:`axis` is the input_feature_dimensions.");
+    AddOutput("Out", "The normalized values with the same shape as X.");
+    AddAttr<int>("axis",
+                 "The dimension index of Input(x) to perform log_softmax,"
+                 "default -1 for last dimension")
+        .SetDefault(-1);
+    AddComment(R"DOC(
+LogSoftmax Operator.
+
+)DOC");
+  }
+};
+
+class LogSoftmaxOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Out"}};
+    return m;
+  }
+};
+
+class LogSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "log_softmax_grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@grad", "log_softmax_grad");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        platform::errors::InvalidArgument("Input(Out) and its gradients "
+                                          "should have the same shape."));
+
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("log_softmax_grad");
+    op->SetInput("Out", this->Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker,
+                  ops::LogSoftmaxOpInferVarType,
+                  ops::LogSoftmaxGradOpMaker<paddle::framework::OpDesc>,
+                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    log_softmax,
+    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    log_softmax_grad,
+    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..02fca246d241d476b5540a6af8f49b16d4dae416
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/log_softmax_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b983ac54157d9d0679ac237ca94e742b38833864
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+static inline int CanonicalAxis(const int axis, const int rank) {
+  if (axis < 0) {
+    return axis + rank;
+  }
+  return axis;
+}
+
+static inline int SizeToAxis(const int axis, const framework::DDim dims) {
+  int size = 1;
+  for (int i = 0; i < axis; i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+static inline int SizeFromAxis(const int axis, const framework::DDim dims) {
+  int size = 1;
+  for (int i = axis; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = static_cast<T>(-64.);
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct LogSoftmaxFunctor {
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y, const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+    constexpr int kAxisDim = 1;
+
+    int axis_dim = X->dims()[axis];
+    const int n = SizeToAxis(axis, X->dims());
+    const int d = SizeFromAxis(axis, X->dims());
+    framework::DDim dim_2d{n, d};
+
+    auto logits = EigenMatrix<T>::From(*X, dim_2d);
+    auto log_softmax = EigenMatrix<T>::From(*Y, dim_2d);
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_axis(kAxisDim);
+    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
+    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+
+    // For numerical stability, logits should be shifted by maximum number along
+    // axis, calculate shifted_logits into log_softmax tensor for memory reuse.
+    if (num_remain == 1) {
+      // axis == -1, axis and class in same dimension, calculate along
+      // class dimension directly for higher performance
+      log_softmax.device(*context.eigen_device()) =
+          (logits -
+           logits.maximum(along_axis)
+               .eval()
+               .reshape(batch_by_one)
+               .broadcast(one_by_class))
+              .unaryExpr(ValueClip<T>());
+    } else {
+      // axis != -1, class dimension split into (axis, remain), max and sum
+      // should be calculated along axis dimension
+      log_softmax.device(*context.eigen_device()) =
+          (logits.reshape(batch_axis_remain) -
+           logits.reshape(batch_axis_remain)
+               .maximum(along_axis)
+               .eval()
+               .reshape(batch_one_remain)
+               .broadcast(one_axis_one)
+               .reshape(batch_classes))
+              .unaryExpr(ValueClip<T>());
+    }
+
+    log_softmax.device(*context.eigen_device()) =
+        log_softmax -
+        log_softmax.exp()
+            .eval()
+            .reshape(batch_axis_remain)
+            .sum(along_axis)
+            .log()
+            .broadcast(one_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* Out = context.Output<framework::Tensor>("Out");
+    const int rank = X->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    Out->mutable_data<T>(context.GetPlace());
+
+    LogSoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), X, Out, axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct LogSoftmaxGradFunctor {
+  void operator()(const DeviceContext& context, const framework::Tensor* Y,
+                  const framework::Tensor* dY, framework::Tensor* dX,
+                  const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+
+    const int n = SizeToAxis(axis, Y->dims());
+    const int d = SizeFromAxis(axis, Y->dims());
+    framework::DDim dim_2d{n, d};
+
+    auto y = EigenMatrix<T>::From(*Y, dim_2d);
+    auto dy = EigenMatrix<T>::From(*dY, dim_2d);
+    auto dx = EigenMatrix<T>::From(*dX, dim_2d);
+
+    const int axis_dim = Y->dims()[axis];
+    const int batch_size = y.dimension(kBatchDim);
+    const int num_classes = y.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+
+    dx.device(*context.eigen_device()) =
+        dy -
+        (y.exp()) * (dy.reshape(batch_axis_remain)
+                         .sum(along_class)
+                         .broadcast(one_axis));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* Out = context.Input<framework::Tensor>("Out");
+    auto* dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const int rank = Out->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
+
+    LogSoftmaxGradFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), Out, dOut, dX, axis);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b44c02757fae9648a7e660a06c03af45d621e02
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/masked_select_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class MaskedSelectOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect");
+    framework::DDim output_dims(ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Y", output_dims);
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class MaskedSelectOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    AddInput("Mask",
+             "The mask of Input Tensor to be selected which is a bool Tensor.");
+    AddOutput(
+        "Y",
+        "The returned tensor, the data type "
+        "is same as input, will be on the same device with the input Tensor.");
+    AddComment(R"DOC(
+Size Operator.
+
+Return a new 0-D tensor which indexes the indexed tensor according
+the mask which is a tensor withe data type bool.
+)DOC");
+  }
+};
+
+class MaskedSelectOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Input",
+                   "Input", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Y")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class MaskedSelectGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("masked_select_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Mask", this->Input("Mask"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer,
+                                    "X");
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker,
+                  ops::MaskedSelectGradOpMaker<paddle::framework::OpDesc>,
+                  ops::MaskedSelectGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad,
+                  ops::MaskedSelectedGradNoNeedBufferVarsInferer);
+
+REGISTER_OP_CPU_KERNEL(
+    masked_select,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    masked_select_grad,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/masked_select_op.cu b/paddle/fluid/operators/masked_select_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7dc0516800c483d1d82a2390a64130e77b1efb01
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.cu
@@ -0,0 +1,179 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
+#include "paddle/fluid/operators/masked_select_op.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+
+__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx])
+      mask_array[idx] = 1;
+    else
+      mask_array[idx] = 0;
+  }
+}
+
+template <typename T>
+__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
+                                     const bool* mask, const T* input, T* out,
+                                     int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[index] = input[idx];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
+                                         const bool* mask, const T* input,
+                                         T* out, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[idx] = input[index];
+    } else {
+      out[idx] = 0;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MaskedSelectCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto input = ctx.Input<framework::Tensor>("X");
+    auto mask = ctx.Input<framework::Tensor>("Mask");
+    auto out = ctx.Output<framework::Tensor>("Y");
+    auto* mask_data = mask->data<bool>();
+    auto input_data = input->data<T>();
+
+    auto mask_size = mask->numel();
+    auto input_dim = input->dims();
+    auto mask_dim = mask->dims();
+    PADDLE_ENFORCE_EQ(
+        input_dim, mask_dim,
+        platform::errors::InvalidArgument(
+            "The dim size of input and mask in OP(masked_selected) "
+            "must be equal, but got input dim:(%ld), mask dim: "
+            "(%ld). Please check input "
+            "value.",
+            input_dim, mask_dim));
+
+    thrust::device_ptr<const bool> mask_dev_ptr =
+        thrust::device_pointer_cast(mask_data);
+    thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
+    auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
+
+    framework::DDim out_dim{out_size};
+    out->Resize(out_dim);
+    auto out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    Tensor mask_array;
+    Tensor mask_prefix_sum;
+    mask_array.Resize(mask_dim);
+    mask_prefix_sum.Resize(mask_dim);
+
+    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
+    int32_t* mask_prefix_sum_data =
+        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
+    int threads = 512;
+    int grid = (mask_size + threads - 1) / threads;
+    auto stream = ctx.cuda_device_context().stream();
+    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
+                                               mask_size);
+
+    thrust::device_ptr<int32_t> mask_array_dev_ptr =
+        thrust::device_pointer_cast(mask_array_data);
+    thrust::device_vector<int32_t> mask_array_vec(
+        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
+    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
+                           mask_array_vec.end(), mask_prefix_sum_data);
+
+    SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MaskedSelectGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto input = ctx.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto mask = ctx.Input<framework::Tensor>("Mask");
+    auto out = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* mask_data = mask->data<bool>();
+    auto* input_data = input->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    auto input_size = input->numel();
+    auto mask_size = mask->numel();
+    auto mask_dim = mask->dims();
+
+    auto out_size = mask_size;
+
+    Tensor mask_array;
+    Tensor mask_prefix_sum;
+    mask_array.Resize(mask_dim);
+    mask_prefix_sum.Resize(mask_dim);
+
+    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
+    int32_t* mask_prefix_sum_data =
+        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
+    int threads = 512;
+    int grid = (mask_size + threads - 1) / threads;
+    auto stream = ctx.cuda_device_context().stream();
+    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
+                                               mask_size);
+
+    thrust::device_ptr<int32_t> mask_array_dev_ptr =
+        thrust::device_pointer_cast(mask_array_data);
+    thrust::device_vector<int32_t> mask_array_vec(
+        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
+    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
+                           mask_array_vec.end(), mask_prefix_sum_data);
+
+    SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    masked_select,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    masked_select_grad,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>);
diff --git a/paddle/fluid/operators/masked_select_op.h b/paddle/fluid/operators/masked_select_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8371556c82fe105b6719e845d4fd6232f3a95e
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+
+template <typename DeviceContext, typename T>
+class MaskedSelectKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto input = context.Input<framework::Tensor>("X");
+    auto mask = context.Input<framework::Tensor>("Mask");
+    auto out = context.Output<framework::Tensor>("Y");
+    auto* mask_data = mask->data<bool>();
+    auto input_data = input->data<T>();
+
+    auto mask_size = mask->numel();
+
+    auto input_dim = input->dims();
+    auto mask_dim = mask->dims();
+    PADDLE_ENFORCE_EQ(
+        input_dim, mask_dim,
+        platform::errors::InvalidArgument(
+            "The dim size of input and mask in OP(masked_selected) "
+            "must be equal, but got input dim:(%ld), mask dim: "
+            "(%ld). Please check input "
+            "value.",
+            input_dim, mask_dim));
+
+    int out_size = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) out_size++;
+    }
+
+    framework::DDim out_dim{out_size};
+    out->Resize(out_dim);
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+
+    int index = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) {
+        out_data[index] = input_data[i];
+        index++;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MaskedSelectGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto out = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto mask = context.Input<framework::Tensor>("Mask");
+    auto input = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+
+    auto* mask_data = mask->data<bool>();
+    auto* input_data = input->data<T>();
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+    int mask_size = mask->numel();
+
+    int index = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) {
+        out_data[i] = input_data[index];
+        index++;
+      } else {
+        out_data[i] = 0;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index f8c971954fc4c0b367cc6e62df8f7a596b651b94..42a60e9220cf848ba766a19cb7b4d13edc460c11 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -198,6 +198,11 @@ class Blas {
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
                    int batchCount, int64_t strideA, int64_t strideB) const;
 
+  template <typename T>
+  void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
+                   int K, T alpha, const T** A, const T** B, T beta, T** C,
+                   int batchCount) const;
+
 #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
   template <typename T>
   void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index 64b35cfeaecd1f88395db97d0374d919356651eb..d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -458,6 +458,17 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
 #endif  // CUDA_VERSION >= 9010
 }
 
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
+                           C[k]);
+  }
+}
+
 template <>
 template <typename T>
 void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index cdaf53fea30085b34f07c37d50455c9b02dc5c44..892bf15738141bfbb7e75fa6b37c0cda53a8e098 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
+#include <algorithm>
 #include <cmath>
 #include <limits>
 #include <vector>
@@ -655,6 +656,26 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
 #endif
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
+#ifdef PADDLE_WITH_MKLML
+  const int lda = std::max((transA == CblasNoTrans) ? K : M, 1);
+  const int ldb = std::max((transB == CblasNoTrans) ? N : K, 1);
+  const int ldc = std::max(N, 1);
+  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A,
+                       &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */,
+                       &batchCount);
+#else
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
+                           C[k]);
+  }
+#endif
+}
+
 #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
 template <>
 template <typename T>
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 44b04104419e790b0ca8619b85ec0a1b4d701021..6748d0ab43f70f997b3008f34f4be743b81e8946 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -73,6 +73,13 @@ struct TensorSetConstantCPU {
   float value_;
 };
 
+template <>
+void set_constant_with_place<platform::XPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index b967dd2cfda80358b863dfcd986d4ad7104a9ac0..22164131468a46bffc239509a9213a21f1611ed5 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -111,12 +111,11 @@ __global__ void KernelPool2DGrad(
     int phstart, phend;
     int pwstart, pwend;
     if (adaptive) {
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
+      phstart = AdaptStartIndex(h_offset, output_height, input_height);
+      phend = AdaptEndIndex(h_offset, output_height, input_height);
+
+      pwstart = AdaptStartIndex(w_offset, output_width, input_width);
+      pwend = AdaptEndIndex(w_offset, output_width, input_width);
     } else {
       phstart = (h_offset < ksize_height)
                     ? 0
@@ -159,6 +158,7 @@ __global__ void KernelPool2DGrad(
           pool_size = exclusive ? (hend - hstart) * (wend - wstart)
                                 : ksize_height * ksize_width;
         }
+
         int output_sub_idx = channel_last
                                  ? (ph * output_width + pw) * channels + offsetC
                                  : ph * output_width + pw;
@@ -689,15 +689,14 @@ __global__ void KernelPool3DGrad(
     int phstart, phend;
     int pwstart, pwend;
     if (adaptive) {
-      pdstart = d_offset * output_depth / input_depth;
-      pdend =
-          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
+      pdstart = AdaptStartIndex(d_offset, output_depth, input_depth);
+      pdend = AdaptEndIndex(d_offset, output_depth, input_depth);
+
+      phstart = AdaptStartIndex(h_offset, output_height, input_height);
+      phend = AdaptEndIndex(h_offset, output_height, input_height);
+
+      pwstart = AdaptStartIndex(w_offset, output_width, input_width);
+      pwend = AdaptEndIndex(w_offset, output_width, input_width);
     } else {
       pdstart = (d_offset < ksize_depth)
                     ? 0
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
index 238d9f2905058d267ffbee0669594920d7a9e031..86feaa72d5fa69cd5d76e56182c27b8d048e4c74 100644
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <queue>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/generator.h"
 
 namespace paddle {
 namespace operators {
@@ -31,7 +32,12 @@ UniformSampler::UniformSampler(int64_t range, unsigned int seed)
   dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
 }
 
-int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
+int64_t UniformSampler::Sample() const {
+  return framework::Generator::GetInstance()->is_init_py
+             ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+             : (*dist_)(*random_engine_);
+  // return (*dist_)(*random_engine_);
+}
 
 float UniformSampler::Probability(int64_t value) const { return inv_range_; }
 
@@ -46,8 +52,11 @@ int64_t LogUniformSampler::Sample() const {
   // inverse_transform_sampling method
   // More details:
   // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
-  const int64_t value =
-      static_cast<int64_t>(exp((*dist_)(*random_engine_) * log_range_)) - 1;
+  auto cur_random =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*dist_)(*random_engine_);
+  const int64_t value = static_cast<int64_t>(exp(cur_random * log_range_)) - 1;
   // Mathematically, value should be <= range_, but might not be due to some
   // floating point roundoff, so we mod by range_.
   return value % range_;
@@ -75,8 +84,14 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
 }
 
 int64_t CustomSampler::Sample() const {
-  auto index = (*int_dist_)(*random_engine_);
-  auto p = (*real_dist_)(*random_engine_);
+  auto index =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*int_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*int_dist_)(*random_engine_);
+  auto p =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*real_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*real_dist_)(*random_engine_);
   if (p > alias_probs_[index]) {
     int alias = alias_[index];
 
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0254ad0a563d91282e76cd7bf43343e4d9139842
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -0,0 +1,176 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/matmul_v2_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class MatMulV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2");
+    bool trans_x = ctx->Attrs().Get<bool>("trans_x");
+    bool trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+    std::vector<int64_t> dims_x =
+        paddle::framework::vectorize(ctx->GetInputDim("X"));
+    std::vector<int64_t> dims_y =
+        paddle::framework::vectorize(ctx->GetInputDim("Y"));
+    auto ndims_x = dims_x.size();
+    auto ndims_y = dims_y.size();
+
+    bool x_broadcasted = false, y_broadcasted = false;
+    if (ndims_x == 1) {
+      dims_x.insert(dims_x.begin(), 1);
+      ndims_x = 2;
+      x_broadcasted = true;
+    }
+
+    if (ndims_y == 1) {
+      dims_y.push_back(1);
+      ndims_y = 2;
+      y_broadcasted = true;
+    }
+
+    size_t M, N;
+    if (trans_x) {
+      M = dims_x[ndims_x - 1];
+    } else {
+      M = dims_x[ndims_x - 2];
+    }
+    if (trans_y) {
+      N = dims_y[ndims_y - 2];
+    } else {
+      N = dims_y[ndims_y - 1];
+    }
+
+    std::vector<int64_t> new_dims;
+    if (ndims_x >= ndims_y) {
+      new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+    } else {
+      new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+    }
+    if (!x_broadcasted) {
+      new_dims.push_back(M);
+    }
+    if (!y_broadcasted) {
+      new_dims.push_back(N);
+    }
+    if (x_broadcasted && y_broadcasted) {
+      new_dims.push_back(1);
+    }
+
+    auto out_dims = framework::make_ddim(new_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /* --> */ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "tensor of shape (d0, d1 ... M, K)");
+    AddInput("Y", "tensor of shape (d0, d1 ... K, N)");
+    AddOutput("Out", "tensor of shape (d0, d1 ... M, N)");
+    AddAttr<bool>("trans_x",
+                  "Set true to transpose the last two dimensions of X before "
+                  "doing multiplication")
+        .SetDefault(false);
+    AddAttr<bool>("trans_y",
+                  "Set true to transpose the last two dimensions of Y before "
+                  "doing multiplication")
+        .SetDefault(false);
+    AddComment(
+        R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), 
+        B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). 
+        In addition, it also follows the broadcast rule which is similar as
+        numpy.matmul.
+)DOC");
+  }
+};
+
+class MatMulV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul_v2");
+    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul_v2");
+    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "matmul_v2");
+    auto x_dims = context->GetInputDim("X");
+    auto y_dims = context->GetInputDim("Y");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (context->HasOutput(x_grad_name)) {
+      context->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (context->HasOutput(y_grad_name)) {
+      context->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+template <typename T>
+class MatMulV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("matmul_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Input("Y"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker,
+                  ops::MatMulV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::MatMulV2GradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    matmul_v2, ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    matmul_v2_grad,
+    ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..64ec65a23419725c7cc481beadb9383402a426bd
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/matmul_v2_op.h"
+
+namespace ops = paddle::operators;
+namespace plf = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(matmul_v2,
+                        ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
+                        ops::MatMulV2Kernel<plf::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
+    ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc83e4d964815ec46452bb0086cf17437b3846a4
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -0,0 +1,481 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/dot_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
+
+#ifdef __NVCC__
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
+};
+
+template <typename DeviceContext, typename T>
+void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output,
+                            const std::vector<int>& reduce_dims,
+                            const paddle::framework::ExecutionContext& ctx) {
+  if (reduce_dims.empty()) {
+    // FIXME maybe reduce this copy operation
+    framework::TensorCopySync(*input, ctx.GetPlace(), output);
+    return;
+  }
+#ifdef __NVCC__
+  auto stream = ctx.cuda_device_context().stream();
+  TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+      *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+      IdentityFunctor<T>(), stream);
+#else
+  ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
+      input, output, reduce_dims, true, false, ctx)
+      .template apply<T>();
+#endif
+}
+
+static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims,
+                                 const int y_ndim, const std::int64_t* y_dims,
+                                 std::int64_t* x_bd_dims,
+                                 std::int64_t* y_bd_dims,
+                                 std::int64_t* out_bd_dims) {
+  const int ndim = std::max(x_ndim, y_ndim);
+  std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1);
+  std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1);
+  std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim);
+  std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim);
+
+  for (int i = 0; i < ndim; ++i) {
+    PADDLE_ENFORCE_EQ(
+        x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1,
+        true, platform::errors::InvalidArgument(
+                  "Input(X) and Input(Y) has error dim."));
+    if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) {
+      out_bd_dims[i] = 0;
+    } else {
+      out_bd_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
+    }
+  }
+}
+
+static int64_t GetIndexMessage(const int n, const int64_t* dims,
+                               const int64_t* index) {
+  int64_t sum = 0;
+  for (int i = 0; i < n; ++i) {
+    if (dims[i] > 1) {
+      sum = sum * dims[i] + index[i];
+    }
+  }
+  return sum;
+}
+
+static void IndexIncreaseFromDims(const int ndim, const int64_t* dims,
+                                  int64_t* index) {
+  for (int i = ndim - 1; i >= 0; --i) {
+    ++index[i];
+    if (index[i] >= dims[i]) {
+      index[i] -= dims[i];
+    } else {
+      break;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const Tensor* X, const Tensor* Y,
+                    const std::vector<std::int64_t>& x_dims,
+                    const std::vector<std::int64_t>& y_dims, Tensor* Out,
+                    bool trans_x, bool trans_y,
+                    const paddle::framework::ExecutionContext& ctx) {
+  const int x_ndim = x_dims.size();
+  const int y_ndim = y_dims.size();
+
+  // get data ptr
+  const T* x_data = X->data<T>();
+  const T* y_data = Y->data<T>();
+
+  if (x_ndim == 1 && y_ndim == 1) {
+    PADDLE_ENFORCE_EQ(X->numel(), Y->numel(),
+                      platform::errors::InvalidArgument(
+                          "X's numbers is not equal to Y's numbers,"
+                          "when X/Y's dims =1"));
+    VLOG(3) << "MatMul's case 1";
+    Out->Resize({1});
+    Out->mutable_data<T>(ctx.GetPlace());
+    auto out_eigen = framework::EigenScalar<T>::From(*Out);
+    auto x_eigen = framework::EigenVector<T>::Flatten(*X);
+    auto y_eigen = framework::EigenVector<T>::Flatten(*Y);
+
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    out_eigen.device(dev) = (x_eigen * y_eigen).sum();
+    return;
+  }
+
+  auto& dev_ctx = ctx.template device_context<DeviceContext>();
+  auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+  if (x_ndim == 1) {
+    const int N = X->numel();
+    if (trans_y) {
+      PADDLE_ENFORCE_EQ(
+          y_dims[y_ndim - 1], N,
+          platform::errors::InvalidArgument("Input(Y) has error dim."));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          y_dims[y_ndim - 2], N,
+          platform::errors::InvalidArgument("Input(Y) has error dim."));
+    }
+    std::vector<std::int64_t> out_dims(y_ndim - 1);
+    if (trans_y) {
+      std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin());
+    } else {
+      std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
+      out_dims.back() = y_dims.back();
+    }
+    Out->Resize(framework::make_ddim(out_dims));
+    Out->mutable_data<T>(ctx.GetPlace());
+    if (trans_y) {
+      const int M = Y->numel() / N;
+      VLOG(3) << "MatMul's case 2";
+      blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>());
+    } else {
+      const int M = y_dims[y_ndim - 1];
+      const int batch_size = Y->numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 3";
+        blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 4";
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data,
+                         x_data, 0, Out->data<T>(), batch_size, M * N, 0);
+      }
+    }
+    return;
+  }
+
+  if (y_ndim == 1) {
+    const int N = Y->numel();
+    if (trans_x) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[x_ndim - 2], N,
+          platform::errors::InvalidArgument("Input(X) has error dim."));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          x_dims[x_ndim - 1], N,
+          platform::errors::InvalidArgument("Input(X) has error dim."));
+    }
+    std::vector<std::int64_t> out_dims(x_ndim - 1);
+    if (trans_x) {
+      std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin());
+      out_dims.back() = x_dims.back();
+    } else {
+      std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
+    }
+    Out->Resize(framework::make_ddim(out_dims));
+    Out->mutable_data<T>(ctx.GetPlace());
+
+    if (trans_x) {
+      const int M = x_dims[x_ndim - 1];
+      const int batch_size = X->numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 5";
+        blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 6";
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data,
+                         y_data, 0, Out->data<T>(), batch_size, M * N, 0);
+      }
+    } else {
+      const int M = X->numel() / N;
+      VLOG(3) << "MatMul's case 7";
+      blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+    }
+    return;
+  }
+
+  const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2];
+  const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+  if (trans_y) {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, platform::errors::InvalidArgument(
+                                                 "Input(X) has error dim."));
+  } else {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, platform::errors::InvalidArgument(
+                                                 "Input(X) has error dim."));
+  }
+  const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];
+  const int ndim = std::max(x_ndim, y_ndim);
+  std::vector<std::int64_t> x_broadcast_dims(ndim);
+  std::vector<std::int64_t> y_broadcast_dims(ndim);
+  std::vector<std::int64_t> out_broadcast_dims(ndim);
+
+  GetBroadcastFromDims(x_ndim - 2, x_dims.data(), y_ndim - 2, y_dims.data(),
+                       x_broadcast_dims.data(), y_broadcast_dims.data(),
+                       out_broadcast_dims.data());
+
+  out_broadcast_dims[ndim - 2] = M;
+  out_broadcast_dims[ndim - 1] = N;
+
+  Out->Resize(framework::make_ddim(out_broadcast_dims));
+  Out->mutable_data<T>(ctx.GetPlace());
+
+  const int batch_dim = ndim - 2;
+  // broadcast message
+  const bool is_broadcast_dims = !std::equal(
+      x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim,
+      y_broadcast_dims.cbegin());
+
+  const std::int64_t x_batch_size = std::accumulate(
+      x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  const std::int64_t y_batch_size = std::accumulate(
+      y_broadcast_dims.cbegin(), y_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  const std::int64_t out_batch_size = std::accumulate(
+      out_broadcast_dims.cbegin(), out_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  if (out_batch_size == 0) return;
+  if (x_batch_size == 1 && y_batch_size == 1) {
+    VLOG(3) << "MatMul's case 8";
+    blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
+              trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
+              y_data, 0.0f, Out->data<T>());
+  } else if (x_batch_size == 1) {
+    if (M == 1 && trans_y) {
+      VLOG(3) << "MatMul's case 9";
+      blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f,
+                Out->data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 10";
+      blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                       trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
+                       x_data, y_data, 0, Out->data<T>(), out_batch_size, 0,
+                       K * N);
+    }
+  } else if (y_batch_size == 1) {
+    if (!trans_x) {
+      VLOG(3) << "MatMul's case 11";
+      blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
+                x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f,
+                Out->data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 12";
+      blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
+                       1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size,
+                       M * K, 0);
+    }
+  } else if (!is_broadcast_dims) {
+    VLOG(3) << "MatMul's case 13";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
+                     y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N);
+  } else {
+    // in the case, can't use stridedgemm
+    std::vector<const T*> x_ptr(out_batch_size);
+    std::vector<const T*> y_ptr(out_batch_size);
+    std::vector<T*> out_ptr(out_batch_size);
+    std::vector<std::int64_t> index(batch_dim, 0);
+    for (std::int64_t i = 0; i < out_batch_size; ++i) {
+      // using the index to get offset
+      const std::int64_t x_index =
+          GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data());
+      const std::int64_t y_index =
+          GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data());
+
+      x_ptr[i] = x_data + x_index * M * K;
+      y_ptr[i] = y_data + y_index * K * N;
+      out_ptr[i] = Out->data<T>() + i * M * N;
+      IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data());
+    }
+    VLOG(3) << "MatMul's case 14";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
+                     x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(),
+                     out_batch_size);
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const Tensor* X, const Tensor* Y, Tensor* Out, bool trans_x,
+                    bool trans_y,
+                    const paddle::framework::ExecutionContext& ctx) {
+  const std::vector<std::int64_t> x_dims = vectorize(X->dims());
+  const std::vector<std::int64_t> y_dims = vectorize(Y->dims());
+  MatMulFunction<DeviceContext, T>(X, Y, x_dims, y_dims, Out, trans_x, trans_y,
+                                   ctx);
+}
+
+template <typename DeviceContext, typename T>
+class MatMulV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* Out = ctx.Output<Tensor>("Out");
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+    MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MatMulV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+
+    // get dims
+    std::vector<std::int64_t> x_dims = vectorize(X->dims());
+    std::vector<std::int64_t> y_dims = vectorize(Y->dims());
+    std::vector<std::int64_t> dout_dims = vectorize(dOut->dims());
+
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int ndim = dout_dims.size();
+
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    // x's or y's dim = 1
+    if (x_ndim == 1 && y_ndim == 1) {
+      if (dx) dx->mutable_data<T>(ctx.GetPlace());
+      if (dy) dy->mutable_data<T>(ctx.GetPlace());
+      if (dOut->numel() == 1) {
+        DotGradFunction<DeviceContext, T>(X, Y, dOut, dx, dy, ctx);
+        return;
+      }
+    }
+    // It is very tricky. For this broadcast, currently using the reduce sum to
+    // get gradient.
+    if (x_ndim == 1) {
+      x_dims.insert(x_dims.begin() + 0, 1);
+      x_ndim += 1;
+      if (trans_x)
+        dout_dims.push_back(1);
+      else
+        dout_dims.insert(dout_dims.begin() + ndim - 1, 1);
+      ndim += 1;
+    }
+
+    if (y_ndim == 1) {
+      y_dims.push_back(1);
+      y_ndim += 1;
+      if (trans_y)
+        dout_dims.insert(dout_dims.begin() + ndim - 1, 1);
+      else
+        dout_dims.push_back(1);
+      ndim += 1;
+    }
+
+    // the normal case
+    Tensor dx_help, dy_help;
+    if (trans_x) {
+      if (trans_y) {
+        // X'Y': dA = Y'G', dB = G'X'
+        if (dx)
+          MatMulFunction<DeviceContext, T>(Y, dOut, y_dims, dout_dims, &dx_help,
+                                           true, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(dOut, X, dout_dims, x_dims, &dy_help,
+                                           true, true, ctx);
+      } else {
+        // X'Y: dX = YG', dY = XG
+        if (dx)
+          MatMulFunction<DeviceContext, T>(Y, dOut, y_dims, dout_dims, &dx_help,
+                                           false, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(X, dOut, x_dims, dout_dims, &dy_help,
+                                           false, false, ctx);
+      }
+    } else {
+      if (trans_y) {
+        // XY': dX = GY, dY = G'X
+        if (dx)
+          MatMulFunction<DeviceContext, T>(dOut, Y, dout_dims, y_dims, &dx_help,
+                                           false, false, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(dOut, X, dout_dims, x_dims, &dy_help,
+                                           true, false, ctx);
+      } else {
+        // XY: dX = GY', dY = X'G
+        if (dx)
+          MatMulFunction<DeviceContext, T>(dOut, Y, dout_dims, y_dims, &dx_help,
+                                           false, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(X, dOut, x_dims, dout_dims, &dy_help,
+                                           true, false, ctx);
+      }
+    }
+    // get help dims
+    const std::vector<std::int64_t> dx_help_dims = vectorize(dx_help.dims());
+    const std::vector<std::int64_t> dy_help_dims = vectorize(dy_help.dims());
+
+    std::vector<std::int64_t> dx_broadcast_dims(ndim);
+    std::vector<std::int64_t> dy_broadcast_dims(ndim);
+
+    std::fill(dx_broadcast_dims.data(),
+              dx_broadcast_dims.data() + ndim - x_ndim, 1);
+    std::fill(dy_broadcast_dims.data(),
+              dy_broadcast_dims.data() + ndim - y_ndim, 1);
+    std::copy(x_dims.data(), x_dims.data() + x_ndim,
+              dx_broadcast_dims.data() + ndim - x_ndim);
+    std::copy(y_dims.data(), y_dims.data() + y_ndim,
+              dy_broadcast_dims.data() + ndim - y_ndim);
+
+    std::vector<int> dx_reduce_dims;
+    std::vector<int> dy_reduce_dims;
+    for (int idx = 0; idx <= ndim - 3; idx++) {
+      if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) {
+        dx_reduce_dims.push_back(idx);
+      }
+      if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) {
+        dy_reduce_dims.push_back(idx);
+      }
+    }
+    // reduce sum to get grad by ReduceSum
+    if (dx) {
+      dx->Resize(dx_help.dims());
+      ReduceSumForMatmulGrad<DeviceContext, T>(&dx_help, dx, dx_reduce_dims,
+                                               ctx);
+      dx->Resize(X->dims());
+    }
+    if (dy) {
+      dy->Resize(dy_help.dims());
+      ReduceSumForMatmulGrad<DeviceContext, T>(&dy_help, dy, dy_reduce_dims,
+                                               ctx);
+      dy->Resize(Y->dims());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index aa9606b5f85896cf4905c53b655f894e6429fc9a..5ca9216d0c8d6b3f773a1eb1a0cec216ca6ed4f3 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -76,6 +76,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   // paddle uses beta but mkldnn uses alpha for swish
   if (algorithm == mkldnn::algorithm::eltwise_swish) {
     std::swap(alpha, beta);
+  } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+    alpha = ctx.Attr<T>("threshold");
   }
 
   PADDLE_ENFORCE(
@@ -119,6 +121,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
   // paddle uses beta but mkldnn uses alpha for swish
   if (algorithm == mkldnn::algorithm::eltwise_swish) {
     std::swap(alpha, beta);
+  } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+    alpha = ctx.Attr<T>("threshold");
   }
 
   auto diff_dst_tz = framework::vectorize<int64_t>(diff_y->dims());
@@ -192,6 +196,10 @@ template <typename T>
 using ReluMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
 
+template <typename T>
+using Relu6MKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_bounded_relu>;
+
 template <typename T>
 using SwishMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_swish>;
@@ -216,6 +224,10 @@ template <typename T>
 using ReluMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
 
+template <typename T>
+using Relu6MKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_bounded_relu>;
+
 template <typename T>
 using SwishMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_swish>;
@@ -249,6 +261,7 @@ namespace ops = paddle::operators;
 
 #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)                     \
   __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);          \
+  __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor);       \
   __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);    \
   __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor);          \
   __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor);       \
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 17e1e1958346155af32cf75b5e9fc25cdbdd91eb..7d99bb7d2b7a7049c67788df4c507afc14880815 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -72,7 +72,7 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
   return dst_dt;
 }
 
-template <typename T>
+template <typename T, typename K, typename T_out>
 class ConvMKLDNNHandlerT
     : public platform::MKLDNNHandlerT<T, mkldnn::convolution_forward> {
  public:
@@ -227,7 +227,7 @@ class ConvMKLDNNHandlerT
           platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
                                   MKLDNNMemoryFormat::any);
       const auto dst_md = platform::MKLDNNMemDesc(
-          dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+          dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
 
       const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
                                          : mkldnn::prop_kind::forward_training;
@@ -313,29 +313,29 @@ class ConvMKLDNNHandlerT
     if (is_test && weights_mem_p) {
       return weights_mem_p;
     } else {
-      const T* filter_data = filter->data<T>();
+      const K* filter_data = filter->data<K>();
       auto weights_tz = framework::vectorize(filter->dims());
       GetWeightsTz(weights_tz, groups);
 
       auto user_src_md = platform::MKLDNNMemDesc(
-          weights_tz, platform::MKLDNNGetDataType<T>(),
+          weights_tz, platform::MKLDNNGetDataType<K>(),
           GetWeightsFormat(filter->format(), groups, is_conv3d));
 
       return this->AcquireMemoryWithReorder(
           user_src_md, this->fwd_pd_->weights_desc(),
-          to_void_cast<T>(filter_data), "@weights_mem_p", is_test);
+          to_void_cast<K>(filter_data), "@weights_mem_p", is_test);
     }
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
       const framework::Tensor* bias, const bool is_test) {
-    const T* bias_data = bias->data<T>();
+    const K* bias_data = bias->data<K>();
     auto user_bias_md = platform::MKLDNNMemDesc(
-        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<T>(),
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
         MKLDNNMemoryFormat::x);
 
     return this->AcquireMemoryWithReorder(
-        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<T>(bias_data),
+        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<K>(bias_data),
         "@bias_mem_p", is_test);
   }
 
@@ -358,14 +358,14 @@ class ConvMKLDNNHandlerT
     if (residual_param->format() !=
         platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
       auto residual_memory_p = this->AcquireResidualMemory(residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
       this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
     } else {
       // Changing ShareDataWith to TensorCopy results in performance drop
       // on ResNet architectures
       // (https://github.com/PaddlePaddle/Paddle/issues/22964)
       output->ShareDataWith(*residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
     }
     return dst_memory_p;
   }
@@ -381,7 +381,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool is_INT8 =
         std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
     if (!is_INT8) {
-      ComputeFP32(ctx);
+      ComputeFP32<float>(ctx);
     } else {
       std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
       bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
@@ -399,6 +399,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
   }
 
+  template <typename T_out>
   void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
@@ -414,7 +415,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    ConvMKLDNNHandlerT<T> handler(
+    ConvMKLDNNHandlerT<T, K, T_out> handler(
         ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
         output, ctx.InputName("Input") + ctx.InputName("Filter"));
 
@@ -429,7 +430,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       dst_memory_p =
           handler.AcquireDstMemoryWithResidual(output, residual_param);
     } else {
-      dst_memory_p = handler.AcquireDstMemory(output);
+      dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
     }
 
     auto conv_p = handler.AcquireForwardPrimitive();
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 86c1c3232644a1fed236563a65a16bc2f6466d49..540642c7140e707441ad9c4d71ae9b777863a7bd 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -51,11 +51,11 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
     mkldnn::memory::data_type src_dt =
         paddle::framework::ToMKLDNNDataType(input->type());
     MKLDNNMemoryFormat src_fmt = input->format();
-    std::string key =
-        platform::CreateKey(src_dt, src_tz, ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key = platform::CreateKey(platform::ThreadIDasStr(), src_dt,
+                                          src_tz, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index 37b6e3bb803a2b68cec54059b266bd7585ff9958..d0ecca78ae8b27451bc51a3c1561609fc470a9f8 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/fluid/operators/mean_op.h"
 
@@ -28,21 +29,29 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::normal_distribution<T> dist(mean, std);
-
     const std::string op_type = "gaussian_random";
     auto shape = GetShape(context, op_type);
     tensor->Resize(shape);
     T* data = tensor->mutable_data<T>(context.GetPlace());
     int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    std::normal_distribution<T> dist(mean, std);
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
 
     tensor->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 55bd683f8f4283287e1bd67810170bd4082379a6..29a86a35d7b26f41745907fb6bacf30506c027a0 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -48,11 +48,12 @@ class QuantOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
 
     bool is_negative = ctx.Attr<bool>("is_negative_input");
-    std::string key = platform::CreateKey(src_tz, scale_data, is_negative,
-                                          ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key =
+        platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data,
+                            is_negative, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
index 92e7744e3c0a459f3267f4210d42752b5ec0bcc0..5ad5ad9450503111882a9b3bc2cd9161f74d500e 100644
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
@@ -40,11 +40,12 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
 
     auto src_tz = paddle::framework::vectorize(input->dims());
 
-    std::string key = platform::CreateKey(src_tz, scale_in, scale_out,
-                                          ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key =
+        platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_in,
+                            scale_out, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<dnnl::memory> src_memory;
     std::shared_ptr<dnnl::memory> dst_memory;
diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc
index e99ccd31714787306358d9b19b31a62ff21d5dab..f0b5f4a466a0049c53d51d8610cf115d8bfe0295 100644
--- a/paddle/fluid/operators/nll_loss_op.cc
+++ b/paddle/fluid/operators/nll_loss_op.cc
@@ -55,8 +55,8 @@ class NLLLossOp : public framework::OperatorWithKernel {
                               "Input(Weight) should be a 1D tensor."));
         PADDLE_ENFORCE_EQ(x_dims[1], w_dims[0],
                           platform::errors::InvalidArgument(
-                              "Input(Weight) Tensor's size should match"
-                              "to the class numer."));
+                              "Input(Weight) Tensor's size should match "
+                              "to the the total number of classes."));
       }
     }
     if (x_dims.size() == 2) {
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu
index 3d618805f02aa9b6d5310bfc8a79857f522f8ac5..531c175e03e5eee3eba609c322944b1398253726 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/fluid/operators/nll_loss_op.cu
@@ -44,6 +44,8 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
       out_data[i] = 0;
       continue;
     }
+    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                   "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
     out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight;
   }
@@ -62,6 +64,8 @@ __global__ void GPUNLLLossForward1D_with_reduce(
   for (i = threadIdx.x; i < batch_size; i += NTHREADS) {
     const auto cur_label = label_data[i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                     "label should not be out of bounds.");
       const auto cur_weight = weight_data ? weight_data[cur_label] : (T)1;
       sharedInputs[threadIdx.x] -=
           x_data[i * n_classes + cur_label] * cur_weight;
@@ -198,6 +202,8 @@ __global__ void GPUNLLLossForward2D_no_reduce(
       out_data[index] = 0;
       continue;
     }
+    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                   "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
     out_data[index] =
         -x_data[b * sample_size + cur_label * map_size + h * in_dim3 + w] *
@@ -226,6 +232,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
        i < map_nelem; i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                     "label should not be out of bounds.");
       const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
       input_sum -= x_data[ioffset + i + map_nelem * cur_label] * cur_weight;
       acc_weight += cur_weight;
diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h
index 92f3d169f3f6a3be1009d84ebd87c82691eb9f0c..e93d5792205900635093e5f18d715e4607f73cda 100644
--- a/paddle/fluid/operators/nll_loss_op.h
+++ b/paddle/fluid/operators/nll_loss_op.h
@@ -91,7 +91,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data,
           }
           PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                             platform::errors::InvalidArgument(
-                                "label should nor be out of bounds."));
+                                "label should not be out of bounds."));
           const auto cur_weight =
               weight_data ? weight_data[cur_label] : static_cast<T>(1);
           out_data[index] = -x_data[i * sample_size + cur_label * map_size +
@@ -117,7 +117,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data,
         }
         PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                           platform::errors::InvalidArgument(
-                              "label should nor be out of bounds."));
+                              "label should not be out of bounds."));
         const auto cur_weight =
             weight_data ? weight_data[cur_label] : static_cast<T>(1);
         total_weight_val += cur_weight;
diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
index 92ce600f22b64f82a053233dbd130adefca964fa..7f0b2b7d064ed12875577fee2265ab17c1fce08f 100644
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
@@ -25,15 +25,11 @@ class DGCMomentumOp : public MomentumOp {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
-                      "current_step should be set.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      platform::errors::NotFound(
-                          "Input(nranks) of DGCMomentumOp is not found."));
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
-                      platform::errors::NotFound(
-                          "Output(Grad_out) of DGCMomentumOp is not found."));
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCMomentumOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCMomentumOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Grad_out"), "Output", "Grad_out",
+                   "DGCMomentumOp");
     return MomentumOp::InferShape(ctx);
   }
 
diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index 057a7a38e3f40fdeb400418740dab825f532054c..59035d5a8ca5d4214f1370e1b14b2be9b234fa6a 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -25,34 +25,54 @@ class PnormOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(Tensor) A tensor of rank >= axis.");
     AddAttr<float>("porder",
-                   "The porder is the p order vector norm to calculate.")
+                   "(float, default 2) The porder is the p order vector norm "
+                   "to calculate. Available for porder=0, inf, -inf and any "
+                   "real number.")
         .SetDefault(2.0f);
     AddAttr<int>("axis",
-                 "The axis on which to apply normalization. If axis < 0, "
+                 "The axis on which to apply norm operation. If axis < 0, "
                  "the dimension to pnorm is rank(X) + axis. -1 is "
                  "the last dimension.")
         .SetDefault(-1);
     AddAttr<float>("epsilon",
-                   "(float, default 1e-10) The epsilon value is used "
+                   "(float, default 1e-12) The epsilon value is used "
                    "to avoid division by zero.")
         .SetDefault(1.0e-12f);
     AddAttr<bool>(
         "keepdim",
-        "(bool, default false) Whether to keep the dimensions as the input")
+        "(bool, default false) Whether to keep the dimensions as the input.")
         .SetDefault(false);
-    AddOutput(
-        "Out",
-        "(Tensor) Output tensor for the `(sum(x.pow(p)) + epsion).pow(1/p)`");
+
+    AddAttr<bool>("asvector",
+                  "(bool, default false) as vector norm when axis is None and "
+                  "input is matrix, ")
+        .SetDefault(false);
+    AddOutput("Out", "(Tensor) Output result tensor of p-norm");
     AddComment(R"DOC(
+Pnorm Operator.
+Given a tensor X, compute Lp-norm of X.
 
-Given a tensor, apply 2-normalization along the provided axis.
+When p = 0, defining $0^0 = 0$, the zero-norm of X is simply the number of non-zero elements of X.
+$$
+||X||_{0} = \lim_{p \rightarrow 0} \sum_i |x_i|^p
+$$
 
+When p = inf, the inf-norm of X is the maximum element of X.
 $$
-pnorm = \(\sum_i {abs\(x_i\)^p}  \)^{1/p}
+||X||_\infty = \max_i |x_i|
 $$
 
-where, $\sum_i{x_i^p}$ is calculated along the `axis` dimension.
-        
+When p = -inf, the negative-inf-norm of X is the minimum element of X.
+$$
+||X||_{-\infty} = \min_i |x_i|
+$$
+
+Otherwise, the p-norm of X follows the formula,
+$$
+||X||_{p} = (\sum_i |x_i|^p)^{1/p}
+$$
+where, $\sum_i $ is calculated along the `axis` dimension.
+
 )DOC");
   }
 };
@@ -63,31 +83,38 @@ class PnormOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "p_norm");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "p_norm");
-    auto porder = ctx->Attrs().Get<float>("porder");
-    PADDLE_ENFORCE_NE(porder, INFINITY,
-                      platform::errors::Unimplemented(
-                          "The input porder of p_norm is not support for "
-                          "porder == 0, INFINITY, -INFINITY now."));
-    PADDLE_ENFORCE_NE(porder, -INFINITY,
-                      platform::errors::Unimplemented(
-                          "The input porder of p_norm is not support for "
-                          "porder == 0, INFINITY, -INFINITY now."));
-    PADDLE_ENFORCE_GT(porder, 0.0f,
-                      platform::errors::InvalidArgument(
-                          "The input porder of p_norm is not support for "
-                          "porder <= 0, But received porder=%f.",
-                          porder));
-    auto xdim = ctx->GetInputDim("X");
+    auto x_dim = ctx->GetInputDim("X");
+    auto x_rank = x_dim.size();
     int axis = ctx->Attrs().Get<int>("axis");
     bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    if (axis < 0) axis = xdim.size() + axis;
+
+    PADDLE_ENFORCE_GE(axis, -x_rank,
+                      platform::errors::InvalidArgument(
+                          "Attr(axis) value should be in range [-R, R-1], R is "
+                          "the rank of Input(X). But received axis: %d, R: %d. "
+                          "Current Input(X)'s shape is=[%s].",
+                          axis, x_rank, x_dim));
+    PADDLE_ENFORCE_LT(axis, x_rank,
+                      platform::errors::InvalidArgument(
+                          "Attr(axis) value should be in range [-R, R-1], R is "
+                          "the rank of Input(X). But received axis: %d, R: %d. "
+                          "Current Input(X)'s shape is=[%s].",
+                          axis, x_rank, x_dim));
+
     std::vector<int> reduce_dims;
-    for (int i = 0; i < xdim.size(); ++i) {
-      if (i != axis) reduce_dims.emplace_back(xdim[i]);
+    bool asvector = ctx->Attrs().Get<bool>("asvector");
+    if (asvector) {
+      reduce_dims.emplace_back(1);
+    } else {
+      if (axis < 0) axis = x_dim.size() + axis;
+      for (int i = 0; i < x_dim.size(); ++i) {
+        if (i != axis) reduce_dims.emplace_back(x_dim[i]);
+      }
     }
-    xdim[axis] = 1;
+    x_dim[axis] = 1;
+
     if (keepdim) {
-      ctx->SetOutputDim("Out", xdim);
+      ctx->SetOutputDim("Out", x_dim);
     } else {
       ctx->SetOutputDim("Out", framework::make_ddim(reduce_dims));
     }
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index d9ac98ff880bcf42e0af5bb75b080464c5211671..ba0d46f4c73ec2683e51722033713c5cb3736643 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -49,20 +49,70 @@ __global__ void Pnorm(const T* x, const int pre,
 
   for (int i = blockIdx.x; i < num; i += gridDim.x) {
     int base = (i / post) * post * axis_n + (i % post);
-
     T sum = 0.0;
-    __shared__ T norm;
     for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
       const T x_ij = x[base + j * post];
       sum += inline_pow(inline_abs(x_ij), porder_t);
     }
     T reduce_result = BlockReduce(temp_storage).Sum(sum);
+    if (threadIdx.x == 0) out_norm[i] = inline_pow(reduce_result, porder_inv);
+  }
+}
 
-    if (threadIdx.x == 0) {
-      norm = inline_pow(reduce_result, porder_inv);
-      out_norm[i] = norm;
+template <typename T, int BlockDim>
+__global__ void ZeorNorm(const T* x, const int pre,
+                         const int axis_n,  // dim in axis
+                         const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T sum = 0.0;
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      const T x_ij = x[base + j * post];
+      sum += static_cast<T>(x_ij != 0);
     }
-    __syncthreads();
+    T reduce_result = BlockReduce(temp_storage).Sum(sum);
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
+  }
+}
+
+template <typename T, int BlockDim>
+__global__ void InfNorm(const T* x, const int pre,
+                        const int axis_n,  // dim in axis
+                        const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T cur_max = inline_abs(x[base]);
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      T x_ij_abs = inline_abs(x[base + j * post]);
+      if (cur_max < x_ij_abs) cur_max = x_ij_abs;
+    }
+    T reduce_result = BlockReduce(temp_storage).Reduce(cur_max, cub::Max());
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
+  }
+}
+
+template <typename T, int BlockDim>
+__global__ void NegInfNorm(const T* x, const int pre,
+                           const int axis_n,  // dim in axis
+                           const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T cur_min = inline_abs(x[base]);
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      T x_ij_abs = inline_abs(x[base + j * post]);
+      if (cur_min > x_ij_abs) cur_min = x_ij_abs;
+    }
+    T reduce_result = BlockReduce(temp_storage).Reduce(cur_min, cub::Min());
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
   }
 }
 
@@ -79,9 +129,10 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     auto ndim = out_norm->dims();
     float porder = ctx.Attr<float>("porder");
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto& dev_ctx = ctx.cuda_device_context();
 
@@ -89,8 +140,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
-    Pnorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
-                                                          porder, norm);
+    if (porder == 0) {
+      ZeorNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                               norm);
+    } else if (porder == INFINITY) {
+      InfNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                              norm);
+    } else if (porder == -INFINITY) {
+      NegInfNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n,
+                                                                 post, norm);
+    } else {
+      Pnorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                            porder, norm);
+    }
   }
 };
 
@@ -112,7 +174,6 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad,
       pnorm_i = x_norm[i];
       yout_i = y_grad[i];
     }
-
     __syncthreads();
 
     for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
@@ -125,6 +186,33 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad,
   }
 }
 
+template <typename T, int BlockDim>
+__global__ void InfNormGradient(const T* x, const T* x_norm, const T* y_grad,
+                                const int pre, const int axis_n, const int post,
+                                T* x_grad) {
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    __shared__ T pnorm_i;
+    __shared__ T yout_i;
+    auto base = (i / post) * post * axis_n + (i % post);
+    if (threadIdx.x == 0) {
+      pnorm_i = x_norm[i];
+      yout_i = y_grad[i];
+    }
+    __syncthreads();
+
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      int index = base + j * post;
+      const T x_ij = inline_abs(x[index]);
+      if (x_ij == pnorm_i) {
+        x_grad[index] = inline_sign(x[index]) * yout_i;
+      } else {
+        x_grad[index] = static_cast<T>(0);
+      }
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename AttrType = T>
 class PnormGradCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -143,9 +231,10 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
     float porder = ctx.Attr<float>("porder");
     T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto& dev_ctx = ctx.cuda_device_context();
 
@@ -153,8 +242,17 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
-    PnormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x, x_norm, norm_dy, porder, pre, n, post, eps, dx);
+    if (porder == 0) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      set_zero(dev_ctx, out_dx, static_cast<T>(0));
+    } else if (porder == INFINITY || porder == -INFINITY) {
+      InfNormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+          x, x_norm, norm_dy, pre, n, post, dx);
+    } else {
+      PnormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+          x, x_norm, norm_dy, porder, pre, n, post, eps, dx);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h
index c5bdfe352723b55f80376d6644922af5de099e90..8fca6924a2541d052bb2ebce0225ba5522ff6fd5 100644
--- a/paddle/fluid/operators/p_norm_op.h
+++ b/paddle/fluid/operators/p_norm_op.h
@@ -20,15 +20,19 @@ namespace paddle {
 namespace operators {
 
 inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
+                    int* post, bool asvector) {
   *pre = 1;
   *post = 1;
   *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
+  if (asvector) {
+    *n = product(dim);
+  } else {
+    for (int i = 0; i < axis; ++i) {
+      (*pre) *= dim[i];
+    }
+    for (int i = axis + 1; i < dim.size(); ++i) {
+      (*post) *= dim[i];
+    }
   }
 }
 
@@ -43,9 +47,10 @@ class PnormKernel : public framework::OpKernel<T> {
     auto xdim = in_x->dims();
     float porder = ctx.Attr<float>("porder");
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
@@ -58,10 +63,20 @@ class PnormKernel : public framework::OpKernel<T> {
     auto x = x_e.reshape(shape);
     auto norm = norm_e.reshape(norm_shape);
 
+    // p=0 means number of non-zero elements of (x)
+    // p=inf means the maximum of |x|
+    // p=-inf means the minimum of |x|
+    // otherwise, Lp-norm = pow(sum(pow(|x|, p)), 1/p)
     Eigen::DSizes<int, 1> rdim(1);
-    auto xp = (x.abs()).pow(porder);
-    auto sum = xp.sum(rdim);
-    norm.device(*place) = sum.pow(1.0f / porder);
+    if (porder == 0) {
+      norm.device(*place) = (x != x.constant(0)).template cast<T>().sum(rdim);
+    } else if (porder == INFINITY) {
+      norm.device(*place) = x.abs().maximum(rdim);
+    } else if (porder == -INFINITY) {
+      norm.device(*place) = x.abs().minimum(rdim);
+    } else {
+      norm.device(*place) = x.abs().pow(porder).sum(rdim).pow(1.0f / porder);
+    }
   }
 };
 
@@ -81,9 +96,10 @@ class PnormGradKernel : public framework::OpKernel<T> {
     float porder = ctx.Attr<float>("porder");
 
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
     Eigen::DSizes<int, 3> shape(pre, n, post);
     Eigen::DSizes<int, 3> rshape(pre, 1, post);
 
@@ -102,10 +118,20 @@ class PnormGradKernel : public framework::OpKernel<T> {
     Eigen::DSizes<int, 1> rdim(1);
     Eigen::DSizes<int, 3> bcast(1, n, 1);
 
-    dx.device(*place) = (x.abs()).pow(porder - 1.0f);
-    dx.device(*place) =
-        dx / ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps));
-    dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign();
+    if (porder == 0) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      set_zero(dev_ctx, out_dx, static_cast<T>(0));
+    } else if (porder == INFINITY || porder == -INFINITY) {
+      dx.device(*place) =
+          (x.abs() == norm.broadcast(bcast)).template cast<T>() * x.sign() *
+          norm_dy.broadcast(bcast);
+    } else {
+      dx.device(*place) =
+          (x.abs()).pow(porder - 1.0f) /
+          ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps));
+      dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign();
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d41b823b6551647803ae5641f72955dbbc1eb62
--- /dev/null
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -0,0 +1,912 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                         const int in_height, const int in_width,
+                         const int out_depth, const int out_height,
+                         const int out_width, const int pad_front,
+                         const int pad_top, const int pad_left, const int out_d,
+                         const int out_h, const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+       in_h >= in_height || in_w >= in_width)
+          ? value
+          : in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                         const int in_depth, const int in_height,
+                         const int in_width, const int out_depth,
+                         const int out_height, const int out_width,
+                         const int pad_front, const int pad_top,
+                         const int pad_left, const int out_d, const int out_h,
+                         const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+      in_h >= in_height || in_w >= in_width) {
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = value;
+    }
+  } else {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = in_data[in_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                           const int in_height, const int in_width,
+                           const int out_depth, const int out_height,
+                           const int out_width, const int pad_front,
+                           const int pad_top, const int pad_left,
+                           const int out_d, const int out_h, const int out_w,
+                           const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                           const int in_depth, const int in_height,
+                           const int in_width, const int out_depth,
+                           const int out_height, const int out_width,
+                           const int pad_front, const int pad_top,
+                           const int pad_left, const int out_d, const int out_h,
+                           const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                             const int in_height, const int in_width,
+                             const int out_depth, const int out_height,
+                             const int out_width, const int pad_front,
+                             const int pad_top, const int pad_left,
+                             const int out_d, const int out_h, const int out_w,
+                             const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                             const int in_depth, const int in_height,
+                             const int in_width, const int out_depth,
+                             const int out_height, const int out_width,
+                             const int pad_front, const int pad_top,
+                             const int pad_left, const int out_d,
+                             const int out_h, const int out_w, const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                            const int in_height, const int in_width,
+                            const int out_depth, const int out_height,
+                            const int out_width, const int pad_front,
+                            const int pad_top, const int pad_left,
+                            const int out_d, const int out_h, const int out_w,
+                            const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                            const int in_depth, const int in_height,
+                            const int in_width, const int out_depth,
+                            const int out_height, const int out_width,
+                            const int pad_front, const int pad_top,
+                            const int pad_left, const int out_d,
+                            const int out_h, const int out_w, const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DNCDHW(const T* in_data, const int num, const int channels,
+                const int in_depth, const int in_height, const int in_width,
+                const int out_depth, const int out_height, const int out_width,
+                const int pad_front, const int pad_top, const int pad_left,
+                T value, T* out_data,
+                void (*pad_func)(const T*, T*, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(in_data, out_data, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, out_d, out_h, out_w, value);
+          }
+        }
+      }
+      in_data += in_depth * in_height * in_width;
+      out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DNDHWC(const T* in_data, const int num, const int channels,
+                const int in_depth, const int in_height, const int in_width,
+                const int out_depth, const int out_height, const int out_width,
+                const int pad_front, const int pad_top, const int pad_left,
+                T value, T* out_data,
+                void (*pad_func)(const T*, T*, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(in_data, out_data, channels, in_depth, in_height, in_width,
+                   out_depth, out_height, out_width, pad_front, pad_top,
+                   pad_left, out_d, out_h, out_w, value);
+        }
+      }
+    }
+    in_data += in_depth * in_height * in_width * channels;
+    out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth,
+                         const int in_height, const int in_width,
+                         const int out_depth, const int out_height,
+                         const int out_width, const int pad_front,
+                         const int pad_top, const int pad_left, const int out_d,
+                         const int out_h, const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] =
+        d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels,
+                         const int in_depth, const int in_height,
+                         const int in_width, const int out_depth,
+                         const int out_height, const int out_width,
+                         const int pad_front, const int pad_top,
+                         const int pad_left, const int out_d, const int out_h,
+                         const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      d_in_data[in_index + c] = d_out_data[out_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                           const int in_depth, const int in_height,
+                           const int in_width, const int out_depth,
+                           const int out_height, const int out_width,
+                           const int pad_front, const int pad_top,
+                           const int pad_left, const int out_d, const int out_h,
+                           const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                           const int channels, const int in_depth,
+                           const int in_height, const int in_width,
+                           const int out_depth, const int out_height,
+                           const int out_width, const int pad_front,
+                           const int pad_top, const int pad_left,
+                           const int out_d, const int out_h, const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                             const int in_depth, const int in_height,
+                             const int in_width, const int out_depth,
+                             const int out_height, const int out_width,
+                             const int pad_front, const int pad_top,
+                             const int pad_left, const int out_d,
+                             const int out_h, const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                             const int channels, const int in_depth,
+                             const int in_height, const int in_width,
+                             const int out_depth, const int out_height,
+                             const int out_width, const int pad_front,
+                             const int pad_top, const int pad_left,
+                             const int out_d, const int out_h,
+                             const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                            const int in_depth, const int in_height,
+                            const int in_width, const int out_depth,
+                            const int out_height, const int out_width,
+                            const int pad_front, const int pad_top,
+                            const int pad_left, const int out_d,
+                            const int out_h, const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                            const int channels, const int in_depth,
+                            const int in_height, const int in_width,
+                            const int out_depth, const int out_height,
+                            const int out_width, const int pad_front,
+                            const int pad_top, const int pad_left,
+                            const int out_d, const int out_h, const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels,
+                    const int in_depth, const int in_height, const int in_width,
+                    const int out_depth, const int out_height,
+                    const int out_width, const int pad_front, const int pad_top,
+                    const int pad_left, const T* d_out_data,
+                    void (*pad_func)(T*, const T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(d_in_data, d_out_data, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, out_d, out_h, out_w);
+          }
+        }
+      }
+      d_in_data += in_depth * in_height * in_width;
+      d_out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels,
+                    const int in_depth, const int in_height, const int in_width,
+                    const int out_depth, const int out_height,
+                    const int out_width, const int pad_front, const int pad_top,
+                    const int pad_left, const T* d_out_data,
+                    void (*pad_func)(T*, const T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(d_in_data, d_out_data, channels, in_depth, in_height,
+                   in_width, out_depth, out_height, out_width, pad_front,
+                   pad_top, pad_left, out_d, out_h, out_w);
+        }
+      }
+    }
+    d_in_data += in_depth * in_height * in_width * channels;
+    d_out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+static inline std::vector<int> GetPaddings(
+    const framework::ExecutionContext& context) {
+  std::vector<int> paddings(6);
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    auto paddings_data = paddings_t->data<int>();
+    std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int));
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings.data());
+  }
+  return paddings;
+}
+
+template <typename T>
+class Pad3dCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = static_cast<T>(context.Attr<float>("value"));
+
+    auto* x = context.Input<Tensor>("X");
+    auto in_dims = x->dims();
+    const T* in_data = x->data<T>();
+
+    auto* out = context.Output<Tensor>("Out");
+    if (data_format == "NCDHW") {
+      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5],
+                   in_dims[3] + pads[2] + pads[3],
+                   in_dims[4] + pads[0] + pads[1]});
+    } else {
+      out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5],
+                   in_dims[2] + pads[2] + pads[3],
+                   in_dims[3] + pads[0] + pads[1], in_dims[4]});
+    }
+    auto out_dims = out->dims();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    int channels = in_dims[1];
+    int in_depth = in_dims[2];
+    int in_height = in_dims[3];
+    int in_width = in_dims[4];
+    int out_depth = out_dims[2];
+    int out_height = out_dims[3];
+    int out_width = out_dims[4];
+    if (data_format == "NDHWC") {
+      channels = in_dims[4];
+      in_depth = in_dims[1];
+      in_height = in_dims[2];
+      in_width = in_dims[3];
+      out_depth = out_dims[1];
+      out_height = out_dims[2];
+      out_width = out_dims[3];
+    }
+
+    if (mode == "reflect") {
+      PADDLE_ENFORCE_GT(in_depth, pads[4],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_front"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_front(%d).",
+                            in_depth, pads[4]));
+      PADDLE_ENFORCE_GT(in_depth, pads[5],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_back"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_back(%d).",
+                            in_depth, pads[5]));
+
+      PADDLE_ENFORCE_GT(in_height, pads[2],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_top"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_top(%d).",
+                            in_height, pads[2]));
+      PADDLE_ENFORCE_GT(in_height, pads[3],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_bottom"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_bottom(%d).",
+                            in_height, pads[3]));
+
+      PADDLE_ENFORCE_GT(in_width, pads[0],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_left"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_left(%d).",
+                            in_width, pads[0]));
+      PADDLE_ENFORCE_GT(in_width, pads[1],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_right"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_right(%d).",
+                            in_width, pads[1]));
+    }
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = in_dims[0];
+    if (data_format == "NCDHW") {
+      std::map<std::string,
+               void (*)(const T*, T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int, const T)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DFuncNCDHW;
+      func_map["replicate"] = ReplicatePad3DFuncNCDHW;
+      func_map["circular"] = CircularPad3DFuncNCDHW;
+      func_map["constant"] = ConstPad3DFuncNCDHW;
+      Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width,
+                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+                 value, out_data, func_map[mode]);
+    } else {
+      std::map<std::string, void (*)(const T*, T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const T)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DFuncNDHWC;
+      func_map["replicate"] = ReplicatePad3DFuncNDHWC;
+      func_map["circular"] = CircularPad3DFuncNDHWC;
+      func_map["constant"] = ConstPad3DFuncNDHWC;
+      Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width,
+                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+                 value, out_data, func_map[mode]);
+    }
+  }
+};
+
+template <typename T>
+class Pad3dGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CPUDeviceContext>(),
+             d_in, static_cast<T>(0));
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = d_in_dims[0];
+    if (data_format == "NCDHW") {
+      const int channels = d_in_dims[1];
+      const int in_depth = d_in_dims[2];
+      const int in_height = d_in_dims[3];
+      const int in_width = d_in_dims[4];
+      const int out_depth = d_out_dims[2];
+      const int out_height = d_out_dims[3];
+      const int out_width = d_out_dims[4];
+
+      std::map<std::string,
+               void (*)(T*, const T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DGradNCDHW;
+      func_map["replicate"] = ReplicatePad3DGradNCDHW;
+      func_map["circular"] = CircularPad3DGradNCDHW;
+      func_map["constant"] = ConstPad3DGradNCDHW;
+
+      Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, d_out_data, func_map[mode]);
+    } else {
+      const int channels = d_in_dims[4];
+      const int in_depth = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_depth = d_out_dims[1];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+
+      std::map<std::string,
+               void (*)(T*, const T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int, const int)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DGradNDHWC;
+      func_map["replicate"] = ReplicatePad3DGradNDHWC;
+      func_map["circular"] = CircularPad3DGradNDHWC;
+      func_map["constant"] = ConstPad3DGradNDHWC;
+
+      Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, d_out_data, func_map[mode]);
+    }
+  }
+};
+
+class Pad3dOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dim.size(), 5,
+                      platform::errors::InvalidArgument(
+                          "The size of Input(X)'s dimension should be equal to "
+                          "5, but received %d. ",
+                          x_dim.size()));
+
+    std::vector<int64_t> out_dims(x_dim.size());
+    auto data_format = ctx->Attrs().Get<std::string>("data_format");
+    out_dims[0] = x_dim[0];
+    if (ctx->HasInput("Paddings")) {
+      auto paddings_dim = ctx->GetInputDim("Paddings");
+      PADDLE_ENFORCE_EQ(paddings_dim.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "Size of Input(Paddings)'s dimension should be "
+                            "equal to 1, but received %d.",
+                            paddings_dim.size()));
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(paddings_dim[0], 6,
+                          platform::errors::InvalidArgument(
+                              "Shape of Input(Paddings) should be equal to "
+                              "[6], but received [%d].",
+                              paddings_dim[0]));
+      }
+      out_dims[1] = x_dim[1];
+      out_dims[2] = x_dim[2];
+      out_dims[3] = x_dim[3];
+    } else {
+      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+      PADDLE_ENFORCE_EQ(
+          paddings.size(), 6,
+          platform::errors::InvalidArgument(
+              "Size of paddings should be equal to 4, but received %d.",
+              static_cast<int>(paddings.size())));
+      if (data_format == "NCDHW") {
+        out_dims[1] = x_dim[1];  // channel
+        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
+                          ? x_dim[2]
+                          : (x_dim[2] + paddings[4] + paddings[5]);  // depth
+
+        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
+                          ? x_dim[3]
+                          : (x_dim[3] + paddings[2] + paddings[3]);  // height
+
+        out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0))
+                          ? x_dim[4]
+                          : (x_dim[4] + paddings[0] + paddings[1]);  // width
+      } else {                                                       // NDHWC
+        out_dims[4] = x_dim[4];                                      // channel
+
+        out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0))
+                          ? x_dim[1]
+                          : (x_dim[1] + paddings[4] + paddings[5]);  // depth
+        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
+                          ? x_dim[2]
+                          : (x_dim[2] + paddings[2] + paddings[3]);  // height
+        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
+                          ? x_dim[3]
+                          : (x_dim[3] + paddings[0] + paddings[1]);  // width
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class Pad3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input of pad3d op. "
+             "The input should be a 5-D tensor with formate NCDHW or NDHWC.");
+    AddOutput("Out",
+              "The output of pad3d op. "
+              "A tensor with the same shape as X.");
+    AddInput("Paddings",
+             "A 1-D tensor to describe the padding rules."
+             "paddings=[0, 1, 2, 3, 4, 5] means "
+             "padding 0 column to left, 1 column to right, "
+             "2 row to top, 3 row to bottom, 4 depth to front "
+             "and 5 depth to back. Size of paddings must be 6.")
+        .AsDispensable();
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules."
+        "paddings=[0, 1, 2, 3, 4, 5] means "
+        "padding 0 column to left, 1 column to right, "
+        "2 row to top, 3 row to bottom, 4 depth to front "
+        "and 5 depth to back. Size of paddings must be 6.");
+    AddAttr<float>("value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas in constant mode.")
+        .SetDefault(0.0f);
+    AddAttr<std::string>(
+        "mode",
+        "(string, default constant) "
+        "Four modes: constant(default), reflect, replicate, circular.")
+        .SetDefault("constant");
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCDHW) Only used in "
+        "An optional string from: \"NDHWC\", \"NCDHW\". "
+        "Defaults to \"NDHWC\". Specify the data format of the input data.")
+        .SetDefault("NCDHW");
+    AddComment(R"DOC(
+Pad3d Operator.
+Pad 3-d images according to 'paddings' and 'mode'. 
+If mode is 'reflect', paddings[0] and paddings[1] must be no greater
+than width-1. The height and depth dimension have the same condition.
+
+Given that X is a channel of image from input:
+
+X = [[[[[1, 2, 3],
+     [4, 5, 6]]]]]
+
+Case 0:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'constant'
+pad_value = 0
+
+Out = [[[[[0. 0. 0. 0. 0. 0. 0.]
+          [0. 0. 1. 2. 3. 0. 0.]
+          [0. 0. 4. 5. 6. 0. 0.]
+          [0. 0. 0. 0. 0. 0. 0.]]]]]
+
+Case 1:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'reflect'
+
+Out = [[[[[6. 5. 4. 5. 6. 5. 4.]
+          [3. 2. 1. 2. 3. 2. 1.]
+          [6. 5. 4. 5. 6. 5. 4.]
+          [3. 2. 1. 2. 3. 2. 1.]]]]]
+
+Case 2:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'replicate'
+
+Out = [[[[[1. 1. 1. 2. 3. 3. 3.]
+          [1. 1. 1. 2. 3. 3. 3.]
+          [4. 4. 4. 5. 6. 6. 6.]
+          [4. 4. 4. 5. 6. 6. 6.]]]]]
+
+Case 3:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'circular'
+
+Out = [[[[[5. 6. 4. 5. 6. 4. 5.]
+          [2. 3. 1. 2. 3. 1. 2.]
+          [5. 6. 4. 5. 6. 4. 5.]
+          [2. 3. 1. 2. 3. 1. 2.]]]]]
+
+)DOC");
+  }
+};
+
+class Pad3dOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d@Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "Pad3d@Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class Pad3dOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> bind) const override {
+    bind->SetInput("X", this->Input("X"));
+    if (this->HasInput("Paddings")) {
+      bind->SetInput("Paddings", this->Input("Paddings"));
+    }
+    bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    bind->SetAttrMap(this->Attrs());
+    bind->SetType("pad3d_grad");
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker,
+                  ops::Pad3dOpGradMaker<paddle::framework::OpDesc>,
+                  ops::Pad3dOpGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad,
+                  ops::Pad3dOpGradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel<float>,
+                       ops::Pad3dCPUKernel<double>, ops::Pad3dCPUKernel<int>,
+                       ops::Pad3dCPUKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel<float>,
+                       ops::Pad3dGradCPUKernel<double>);
diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..672a75389ccf18d11e508ca94d45128b2e7b56b7
--- /dev/null
+++ b/paddle/fluid/operators/pad3d_op.cu
@@ -0,0 +1,788 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+using framework::Tensor;
+
+template <typename T>
+__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data,
+                                const int num, const int channels,
+                                const int in_depth, const int in_height,
+                                const int in_width, const int out_depth,
+                                const int out_height, const int out_width,
+                                const int pad_front, const int pad_top,
+                                const int pad_left, T value, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[nc * in_depth * in_height * in_width +
+                      in_d * in_height * in_width + in_h * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data,
+                                const int num, const int channels,
+                                const int in_depth, const int in_height,
+                                const int in_width, const int out_depth,
+                                const int out_height, const int out_width,
+                                const int pad_front, const int pad_top,
+                                const int pad_left, T value, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    const int in_d = out_d - pad_front;
+    const int in_h = out_h - pad_top;
+    const int in_w = out_w - pad_left;
+
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[n * in_depth * in_height * in_width * channels +
+                      in_d * in_height * in_width * channels +
+                      in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data,
+                                  const int num, const int channels,
+                                  const int in_depth, const int in_height,
+                                  const int in_width, const int out_depth,
+                                  const int out_height, const int out_width,
+                                  const int pad_front, const int pad_top,
+                                  const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);                     // reflect by 0
+    in_d = min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+    in_h = max(in_h, -in_h);                     // reflect by 0
+    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+    in_w = max(in_w, -in_w);                     // reflect by 0
+    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data,
+                                  const int num, const int channels,
+                                  const int in_depth, const int in_height,
+                                  const int in_width, const int out_depth,
+                                  const int out_height, const int out_width,
+                                  const int pad_front, const int pad_top,
+                                  const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = max(in_h, -in_h);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = max(in_w, -in_w);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data,
+                                   const int num, const int channels,
+                                   const int in_depth, const int in_height,
+                                   const int in_width, const int out_depth,
+                                   const int out_height, const int out_width,
+                                   const int pad_front, const int pad_top,
+                                   const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data,
+                                   const int num, const int channels,
+                                   const int in_depth, const int in_height,
+                                   const int in_width, const int out_depth,
+                                   const int out_height, const int out_width,
+                                   const int pad_front, const int pad_top,
+                                   const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int in_w = in_index % in_width;
+
+    int nc = in_index / in_width;
+    const int in_h = nc % in_height;
+
+    nc /= in_height;
+    const int in_d = nc % in_depth;
+
+    nc /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+    d_in_data[in_index] =
+        d_out_data[nc * out_depth * out_height * out_width +
+                   out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int c = in_index % channels;
+    int n = in_index / channels;
+
+    const int in_w = n % in_width;
+    n /= in_width;
+
+    const int in_h = n % in_height;
+    n /= in_height;
+
+    const int in_d = n % in_depth;
+    n /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+
+    d_in_data[in_index] =
+        d_out_data[n * out_depth * out_height * out_width * channels +
+                   out_d * out_height * out_width * channels +
+                   out_h * out_width * channels + out_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data,
+                                      const int num, const int channels,
+                                      const int in_depth, const int in_height,
+                                      const int in_width, const int out_depth,
+                                      const int out_height, const int out_width,
+                                      const int pad_front, const int pad_top,
+                                      const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data,
+                                      const int num, const int channels,
+                                      const int in_depth, const int in_height,
+                                      const int in_width, const int out_depth,
+                                      const int out_height, const int out_width,
+                                      const int pad_front, const int pad_top,
+                                      const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, in_depth * 2 - in_d - 2);
+    in_h = min(in_h, in_height * 2 - in_h - 2);
+    in_w = min(in_w, in_width * 2 - in_w - 2);
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNCDHW(
+    const int out_size, T* d_in_data, const int num, const int channels,
+    const int in_depth, const int in_height, const int in_width,
+    const int out_depth, const int out_height, const int out_width,
+    const int pad_front, const int pad_top, const int pad_left,
+    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNDHWC(
+    const int out_size, T* d_in_data, const int num, const int channels,
+    const int in_depth, const int in_height, const int in_width,
+    const int out_depth, const int out_height, const int out_width,
+    const int pad_front, const int pad_top, const int pad_left,
+    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data,
+                                       const int num, const int channels,
+                                       const int in_depth, const int in_height,
+                                       const int in_width, const int out_depth,
+                                       const int out_height,
+                                       const int out_width, const int pad_front,
+                                       const int pad_top, const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data,
+                                       const int num, const int channels,
+                                       const int in_depth, const int in_height,
+                                       const int in_width, const int out_depth,
+                                       const int out_height,
+                                       const int out_width, const int pad_front,
+                                       const int pad_top, const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+static inline std::vector<int> GetPaddings(
+    const framework::ExecutionContext& context) {
+  std::vector<int> paddings(6);
+  auto* paddings_data = context.Input<Tensor>("Paddings");
+  if (paddings_data) {
+    Tensor pads;
+    framework::TensorCopySync(*paddings_data, platform::CPUPlace(), &pads);
+    auto pads_data = pads.data<int>();
+    std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int));
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings.data());
+  }
+  return paddings;
+}
+
+template <typename T>
+class Pad3dCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = static_cast<T>(context.Attr<float>("value"));
+
+    auto* x = context.Input<Tensor>("X");
+    auto in_dims = x->dims();
+    const T* in_data = x->data<T>();
+    auto* out = context.Output<Tensor>("Out");
+    auto out_dims = out->dims();
+    if (data_format == "NCDHW") {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1];
+      out_dims[2] = in_dims[2] + pads[4] + pads[5];
+      out_dims[3] = in_dims[3] + pads[2] + pads[3];
+      out_dims[4] = in_dims[4] + pads[0] + pads[1];
+    } else {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1] + pads[4] + pads[5];
+      out_dims[2] = in_dims[2] + pads[2] + pads[3];
+      out_dims[3] = in_dims[3] + pads[0] + pads[1];
+      out_dims[4] = in_dims[4];
+    }
+    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
+
+    int channels = in_dims[1];
+    int in_depth = in_dims[2];
+    int in_height = in_dims[3];
+    int in_width = in_dims[4];
+    int out_depth = out_dims[2];
+    int out_height = out_dims[3];
+    int out_width = out_dims[4];
+    if (data_format == "NDHWC") {
+      channels = in_dims[4];
+      in_depth = in_dims[1];
+      in_height = in_dims[2];
+      in_width = in_dims[3];
+      out_depth = out_dims[1];
+      out_height = out_dims[2];
+      out_width = out_dims[3];
+    }
+
+    if (mode == "reflect") {
+      PADDLE_ENFORCE_GT(in_depth, pads[4],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_front"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_front(%d).",
+                            in_depth, pads[4]));
+      PADDLE_ENFORCE_GT(in_depth, pads[5],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_back"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_back(%d).",
+                            in_depth, pads[5]));
+
+      PADDLE_ENFORCE_GT(in_height, pads[2],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_top"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_top(%d).",
+                            in_height, pads[2]));
+      PADDLE_ENFORCE_GT(in_height, pads[3],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_bottom"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_bottom(%d).",
+                            in_height, pads[3]));
+
+      PADDLE_ENFORCE_GT(in_width, pads[0],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_left"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_left(%d).",
+                            in_width, pads[0]));
+      PADDLE_ENFORCE_GT(in_width, pads[1],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_right"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_right(%d).",
+                            in_width, pads[1]));
+    }
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = out->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCDHW") {
+      if (mode == "reflect") {
+        Pad3DReflectNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "replicate") {
+        Pad3DReplicateNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "circular") {
+        Pad3DCircularNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else {
+        Pad3DConstNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            value, out_data);
+      }
+    } else {
+      if (mode == "reflect") {
+        Pad3DReflectNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "replicate") {
+        Pad3DReplicateNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "circular") {
+        Pad3DCircularNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else {
+        Pad3DConstNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            value, out_data);
+      }
+    }
+  }
+};
+
+template <typename T>
+class Pad3dGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CUDADeviceContext>(),
+             d_in, static_cast<T>(0));
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+
+    const int num = d_in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = d_out->numel();
+    const int in_size = d_in->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCDHW") {
+      const int channels = d_in_dims[1];
+      const int in_depth = d_in_dims[2];
+      const int in_height = d_in_dims[3];
+      const int in_width = d_in_dims[4];
+      const int out_depth = d_out_dims[2];
+      const int out_height = d_out_dims[3];
+      const int out_width = d_out_dims[4];
+
+      if (mode == "reflect") {
+        Pad3DGradReflectNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "replicate") {
+        Pad3DGradReplicateNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "circular") {
+        Pad3DGradCircularNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad3DGradConstNCDHW<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      }
+    } else {
+      const int channels = d_in_dims[4];
+      const int in_depth = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_depth = d_out_dims[1];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+      if (mode == "reflect") {
+        Pad3DGradReflectNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "replicate") {
+        Pad3DGradReplicateNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "circular") {
+        Pad3DGradCircularNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad3DGradConstNDHWC<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel<plat::float16>,
+                        ops::Pad3dCUDAKernel<float>,
+                        ops::Pad3dCUDAKernel<double>, ops::Pad3dCUDAKernel<int>,
+                        ops::Pad3dCUDAKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel<plat::float16>,
+                        ops::Pad3dGradCUDAKernel<float>,
+                        ops::Pad3dGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
index 1ed7988dcfcc0831156c09a72e958852f3d45fb5..70d232ad6a51e21b863974e70920eb2d9da895e6 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
@@ -28,25 +28,44 @@ class PixelShuffleOp : public framework::OperatorWithKernel {
                           "Output(Out) of PixelShuffleOp should not be null."));
 
     auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "Input should be a 4-D tensor of format [N, C, H, W], but got %u.",
-            input_dims.size()));
+    PADDLE_ENFORCE_EQ(input_dims.size(), 4,
+                      platform::errors::InvalidArgument(
+                          "Input should be a 4-D tensor of format [N, C, H, W] "
+                          "or [N, H, W, C], but got %u.",
+                          input_dims.size()));
 
     auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
 
-    PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), 0,
-                      platform::errors::InvalidArgument(
-                          "The square of upscale_factor[%u] should divide the "
-                          "number of channel[%u]",
-                          input_dims[1], upscale_factor * upscale_factor));
-
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+    const bool channel_last = (data_format == "NHWC");
+
+    if (!channel_last) {
+      PADDLE_ENFORCE_EQ(
+          input_dims[1] % (upscale_factor * upscale_factor), 0,
+          platform::errors::InvalidArgument(
+              "The square of upscale_factor[%u] should divide the "
+              "number of channel[%u]",
+              input_dims[1], upscale_factor * upscale_factor));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          input_dims[3] % (upscale_factor * upscale_factor), 0,
+          platform::errors::InvalidArgument(
+              "The square of upscale_factor[%u] should divide the "
+              "number of channel[%u]",
+              input_dims[3], upscale_factor * upscale_factor));
+    }
     auto output_dims = input_dims;
     output_dims[0] = input_dims[0];
-    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
-    output_dims[2] = input_dims[2] * upscale_factor;
-    output_dims[3] = input_dims[3] * upscale_factor;
+    if (!channel_last) {
+      output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
+      output_dims[2] = input_dims[2] * upscale_factor;
+      output_dims[3] = input_dims[3] * upscale_factor;
+    } else {
+      output_dims[1] = input_dims[1] * upscale_factor;
+      output_dims[2] = input_dims[2] * upscale_factor;
+      output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor);
+    }
     ctx->SetOutputDim("Out", output_dims);
   }
 };
@@ -54,14 +73,14 @@ class PixelShuffleOp : public framework::OperatorWithKernel {
 class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput(
-        "X",
-        "(Tensor, default Tensor<float>), "
-        "the input feature data of PixelShuffleOp, the layout is [N C H W].");
-    AddOutput(
-        "Out",
-        "(Tensor, default Tensor<float>), the output of "
-        "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
+    AddInput("X",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PixelShuffleOp, the layout is [N, C, "
+             "H, W] or [N, H, W, C].");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), the output of "
+              "PixelShuffleOp. The layout is [N, C/factor^2, H*factor, "
+              "W*factor] or [N, H*factor, W*factor, C/factor^2].");
     AddAttr<int>("upscale_factor",
                  "the factor to increase spatial resolution by.")
         .SetDefault(1)
@@ -70,6 +89,11 @@ class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
                             platform::errors::InvalidArgument(
                                 "upscale_factor should be larger than 0."));
         });
+    AddAttr<std::string>(
+        "data_format",
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\", Specify the data format of the input data.")
+        .SetDefault("NCHW");
 
     AddComment(R"DOC(
 		Pixel Shuffle operator
@@ -114,19 +138,30 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel {
         platform::errors::NotFound("Output(X@Grad) should not be null"));
 
     auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(
-        do_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "Input should be a 4-D tensor of format [N, C, H, W], but got %u.",
-            do_dims.size()));
+    PADDLE_ENFORCE_EQ(do_dims.size(), 4,
+                      platform::errors::InvalidArgument(
+                          "Input should be a 4-D tensor of format [N, C, H, W] "
+                          "or [N, H, W, C], but got %u.",
+                          do_dims.size()));
 
     auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
 
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+    const bool channel_last = (data_format == "NHWC");
+
     auto dx_dims = do_dims;
     dx_dims[0] = do_dims[0];
-    dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
-    dx_dims[2] = do_dims[2] / upscale_factor;
-    dx_dims[3] = do_dims[3] / upscale_factor;
+
+    if (!channel_last) {
+      dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
+      dx_dims[2] = do_dims[2] / upscale_factor;
+      dx_dims[3] = do_dims[3] / upscale_factor;
+    } else {
+      dx_dims[1] = do_dims[1] / upscale_factor;
+      dx_dims[2] = do_dims[2] / upscale_factor;
+      dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor);
+    }
     ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
   }
 };
diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h
index 1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545..b2a0db0f838d5dcc3fed2ed9838f1c43240ce0e7 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.h
+++ b/paddle/fluid/operators/pixel_shuffle_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -24,23 +25,33 @@ class PixelShuffleOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* out = ctx.Output<framework::Tensor>("Out");
+
     out->mutable_data<T>(ctx.GetPlace());
 
     int factor = ctx.Attr<int>("upscale_factor");
 
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    bool channel_last = (data_format == "NHWC");
+
     auto in_dims = in->dims();
     auto o_dims = out->dims();
 
     framework::Tensor t;
     t.ShareDataWith(*in);
-    t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
-
+    if (!channel_last) {
+      t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
+    } else {
+      t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor});
+    }
     std::vector<int> axis = {0, 1, 4, 2, 5, 3};
 
     framework::Tensor o;
     o.ShareDataWith(*out);
-    o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
-
+    if (!channel_last) {
+      o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
+    } else {
+      o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]});
+    }
     math::Transpose<DeviceContext, T, 6> trans;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     trans(dev_ctx, t, &o, axis);
@@ -58,19 +69,32 @@ class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
 
     int factor = ctx.Attr<int>("upscale_factor");
 
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    bool channel_last = (data_format == "NHWC");
+
     auto do_dims = dout->dims();
     auto dx_dims = dx->dims();
 
     framework::Tensor t;
     t.ShareDataWith(*dout);
-    t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
-
+    if (!channel_last) {
+      t.Resize(
+          {do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
+    } else {
+      t.Resize(
+          {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]});
+    }
     std::vector<int> axis = {0, 1, 3, 5, 2, 4};
 
     framework::Tensor o;
     o.ShareDataWith(*dx);
-    o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
-
+    if (!channel_last) {
+      o.Resize(
+          {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
+    } else {
+      o.Resize(
+          {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor});
+    }
     math::Transpose<DeviceContext, T, 6> trans;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     trans(dev_ctx, t, &o, axis);
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 354e5c60a6b9ed80f0f8c44439294bfa2731a423..7749903e5f36f1d93f7e111da4587d6828d445a4 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable,
       out->set_lod(py_out_tensor->lod());
       out->ShareDataWith(*py_out_tensor);
     } catch (py::cast_error &) {
-      PADDLE_THROW("The %d-th output must be LoDTensor", i);
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The %d-th output must be LoDTensor.", i));
     }
   }
 }
diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc
index 9f6df3f32b7463b30804555cbce4d4ee8f03a989..662fe3bcb3b3b2d26afaef0c9388dda329aea645 100644
--- a/paddle/fluid/operators/randint_op.cc
+++ b/paddle/fluid/operators/randint_op.cc
@@ -14,6 +14,8 @@
 
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
@@ -37,20 +39,30 @@ class CPURandintKernel : public framework::OpKernel<T> {
         new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
       }
     }
-
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     if (!new_shape.empty()) out->Resize(framework::make_ddim(new_shape));
     T* data = out->mutable_data<T>(ctx.GetPlace());
     int64_t size = out->numel();
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
+
     std::uniform_int_distribution<T> dist(ctx.Attr<int>("low"),
                                           ctx.Attr<int>("high") - 1);
-    for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) data[i] = dist(gen_engine);
+    } else {
+      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h
index 64ef1c771423f2d820c73df8ed9ff25834f07875..0eb028ad806848a559ba51b9c950d324a598a851 100644
--- a/paddle/fluid/operators/randperm_op.h
+++ b/paddle/fluid/operators/randperm_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <ctime>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/place.h"
@@ -31,11 +32,17 @@ static inline void random_permate(T* data_ptr, int num, unsigned int seed) {
   for (int i = 0; i < num; ++i) {
     data_ptr[i] = static_cast<T>(i);
   }
-  if (seed == 0) {
-    seed = std::random_device()();
+  if (framework::Generator::GetInstance()->is_init_py) {
+    std::shuffle(data_ptr, data_ptr + num,
+                 framework::Generator::GetInstance()->GetCPUEngine());
+
+  } else {
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::srand(seed);
+    std::random_shuffle(data_ptr, data_ptr + num);
   }
-  std::srand(seed);
-  std::random_shuffle(data_ptr, data_ptr + num);
 }
 
 template <typename DeviceContext, typename T>
@@ -51,6 +58,7 @@ class RandpermKernel : public framework::OpKernel<T> {
     if (platform::is_cpu_place(ctx.GetPlace())) {
       T* out_data = out_tensor->mutable_data<T>(platform::CPUPlace());
       random_permate<T>(out_data, n, seed);
+
     } else {
       framework::Tensor tmp_tensor;
       tmp_tensor.Resize(framework::make_ddim({n}));
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index e0bcab1fb547afd6250e73c309cd61d343e631ff..f13b0d800bdc7fea72010069b3f36ebe1e04488a 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -36,15 +36,30 @@ BufferedReader::~BufferedReader() {
 
 BufferedReader::BufferedReader(
     const std::shared_ptr<framework::ReaderBase> &reader,
-    const platform::Place &place, size_t buffer_size)
+    const platform::Place &place, size_t buffer_size, bool pin_memory)
     : framework::DecoratedReader(reader),
       thread_pool_(1),
       place_(place),
-      buffer_size_(buffer_size) {
+      buffer_size_(buffer_size),
+      pin_memory_(pin_memory) {
   VLOG(1) << "BufferedReader";
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_) && !pin_memory) {
+    int dev_idx = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
+    compute_stream_ =
+        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
+                                             .Get(place_)))
+            ->stream();
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
+      event = platform::CudaEventResourcePool::Instance().New(dev_idx);
+    }
+    stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx);
+  }
+#endif
   is_same_place_ = false;
   cpu_buffer_.resize(buffer_size);
-  cuda_pinned_buffer_.resize(buffer_size);
+  cuda_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
 }
 
@@ -65,47 +80,103 @@ void BufferedReader::ReadAsync(size_t i) {
 
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
-      // NOTE: [Copy processing of different input devices]
-      // We may accept input tensor in three different devices:
-      //   - CPUPlace
-      //   - CUDAPinnedPlace
-      //   - CUDAPlace
-      // CUDA Stream Synchronizing is slow, in order to avoid Synchronizing
-      // in BufferedReader thread, we do data copy as follows:
-      //   - If src Tensor on CPU memory, we copy it to CUDAPinned memory
-      //   - IF src Tensor on CUDAPinned memory, we use it directly
-      //   - IF src Tensor on CUDA memory, we use it directly
-      platform::CUDAPinnedPlace cuda_pinned_place;
-      TensorVec &cuda_pinned = cuda_pinned_buffer_[i];
-      if (cuda_pinned.empty()) {
-        cuda_pinned.resize(cpu.size());
+      TensorVec &cuda = cuda_buffer_[i];
+      if (cuda.empty()) {
+        cuda.resize(cpu.size());
       } else {
         PADDLE_ENFORCE_EQ(
-            cuda_pinned.size(), cpu.size(),
+            cuda.size(), cpu.size(),
             platform::errors::InvalidArgument(
                 "Input tensor number on GPU and CPU devices are not matched."));
       }
+      if (pin_memory_) {
+        // NOTE: [Copy processing of different input devices]
+        // We may accept input tensor in three different devices:
+        //   - CPUPlace
+        //   - CUDAPinnedPlace
+        //   - CUDAPlace
+        // CUDA Stream Synchronizing is slow, in order to avoid Synchronizing
+        // in BufferedReader thread, we do data copy as follows:
+        //   - If src Tensor on CPU memory, we copy it to CUDAPinned memory
+        //   - IF src Tensor on CUDAPinned memory, we use it directly
+        //   - IF src Tensor on CUDA memory, we use it directly
+        platform::CUDAPinnedPlace cuda_pinned_place;
+        std::vector<void *> cuda_pinned_ptrs;
+        cuda_pinned_ptrs.reserve(cpu.size());
+        platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          if (platform::is_cpu_place(cpu[i].place())) {
+            cuda[i].Resize(cpu[i].dims());
+            cuda[i].set_layout(cpu[i].layout());
+            cuda_pinned_ptrs.emplace_back(
+                cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()));
+            auto size =
+                cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+
+            memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
+                         BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
+                         cpu[i].data<void>(), size);
+            cuda[i].set_lod(cpu[i].lod());
+          } else {
+            // we set same place flag & use cpu[i] directly
+            is_same_place_ = true;
+          }
+        }
+      } else {
+        // NOTE(liangdun): using async copy instead of TensorCopySync
+        // TensorCopySync would block other stream, because TensorCopySync
+        // issues the copying command to the default stream, it will make two
+        // commands from different streams cannot run concurrently.
+        std::vector<void *> gpu_ptrs;
+        gpu_ptrs.reserve(cpu.size());
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          cuda[i].Resize(cpu[i].dims());
+          cuda[i].set_layout(cpu[i].layout());
+          gpu_ptrs.emplace_back(cuda[i].mutable_data(place_, cpu[i].type()));
+        }
 
-      std::vector<void *> cuda_pinned_ptrs;
-      cuda_pinned_ptrs.reserve(cpu.size());
-      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
-      for (size_t i = 0; i < cpu.size(); ++i) {
-        if (platform::is_cpu_place(cpu[i].place())) {
-          cuda_pinned[i].Resize(cpu[i].dims());
-          cuda_pinned[i].set_layout(cpu[i].layout());
-          cuda_pinned_ptrs.emplace_back(
-              cuda_pinned[i].mutable_data(cuda_pinned_place, cpu[i].type()));
+        // NOTE(zjl): cudaStreamWaitEvent() must be called after all
+        // cuda[i].mutable_data() is called, since some ops release
+        // cuda memory immediately without waiting cuda kernel ends
+        platform::SetDeviceId(
+            BOOST_GET_CONST(platform::CUDAPlace, place_).device);
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            cudaEventRecord(events_[i].get(), compute_stream_));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
+
+        platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          auto cpu_place = cpu[i].place();
+          auto cpu_ptr = cpu[i].data<void>();
+          auto gpu_ptr = gpu_ptrs[i];
           auto size =
               cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-
-          memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
-                       BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
-                       cpu[i].data<void>(), size);
-          cuda_pinned[i].set_lod(cpu[i].lod());
-        } else {
-          // we set same place flag & use cpu[i] directly
-          is_same_place_ = true;
+          if (platform::is_cuda_pinned_place(cpu_place)) {
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         BOOST_GET_CONST(platform::CUDAPinnedPlace, cpu_place),
+                         cpu_ptr, size, stream_.get());
+          } else if ((platform::is_gpu_place(cpu_place))) {
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         BOOST_GET_CONST(platform::CUDAPlace, cpu_place),
+                         cpu_ptr, size, stream_.get());
+          } else {
+            platform::CUDAPinnedPlace cuda_pinned_place;
+            framework::LoDTensor cuda_pinned_tensor;
+            cuda_pinned_tensor.Resize(cpu[i].dims());
+            auto cuda_pinned_ptr = cuda_pinned_tensor.mutable_data(
+                cuda_pinned_place, cpu[i].type());
+            memory::Copy(cuda_pinned_place, cuda_pinned_ptr,
+                         BOOST_GET_CONST(platform::CPUPlace, cpu_place),
+                         cpu_ptr, size);
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         cuda_pinned_place, cuda_pinned_ptr, size,
+                         stream_.get());
+            PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
+          }
+          cuda[i].set_lod(cpu[i].lod());
         }
+        PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
       }
     }
 #endif
@@ -141,7 +212,7 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
   }
 
   *out = std::move((platform::is_gpu_place(place_) && !is_same_place_)
-                       ? cuda_pinned_buffer_[i]
+                       ? cuda_buffer_[i]
                        : cpu_buffer_[i]);
 
   // Do not push current position into ReadAsync. Push the previous position
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 4409aa4d399419a651e01ce7e279525916a29781..42c087b9e47a9ec7e80d05a791af3e04c483ab08 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -35,7 +35,8 @@ class BufferedReader : public framework::DecoratedReader {
 
  public:
   BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
-                 const platform::Place& place, size_t buffer_size);
+                 const platform::Place& place, size_t buffer_size,
+                 bool pin_memory = false);
 
   ~BufferedReader() override;
 
@@ -53,6 +54,7 @@ class BufferedReader : public framework::DecoratedReader {
   ThreadPool thread_pool_;
   platform::Place place_;
   const size_t buffer_size_;
+  bool pin_memory_;
 
   std::queue<std::future<size_t>> position_;
 
@@ -63,8 +65,13 @@ class BufferedReader : public framework::DecoratedReader {
   // buffers and prevent alloc every time.
   bool is_same_place_;
   std::vector<TensorVec> cpu_buffer_;
-  std::vector<TensorVec> cuda_pinned_buffer_;
+  std::vector<TensorVec> cuda_buffer_;
   size_t prev_pos_{-1UL};
+#ifdef PADDLE_WITH_CUDA
+  cudaStream_t compute_stream_;
+  std::shared_ptr<platform::CudaStreamObject> stream_;
+  std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
+#endif
 };
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..322a1637f5deec909db13f1bd0433446cd7606ae
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class LogsumexpOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "logsumexp"; }
+  virtual std::string GetOpType() const { return "Reduce logsumexp"; }
+};
+
+template <typename T>
+class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("logsumexp_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Out", this->Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetAttrMap(this->Attrs());
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker,
+                  ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
+                  ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp);
+
+REGISTER_OP_CPU_KERNEL(logsumexp,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::LogsumexpFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::LogsumexpFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                          float, ops::LogsumexpGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::LogsumexpGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c25e5d01b2758a96192d6fbf8f4e881770cbbbf0
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
+
+REGISTER_OP_CUDA_KERNEL(logsumexp,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::LogsumexpFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::LogsumexpFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::LogsumexpGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::LogsumexpGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d0e00262a37ff7160abd7a865e63377f8b30461
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct LogsumexpFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    auto x_dim = x->dimensions();
+    auto t_dim = x_dim;
+    for (int i = 0; i < static_cast<int>(dim.size()); i++) {
+      t_dim[dim[i]] = 1;
+    }
+
+    auto r_dim = x_dim;
+    for (int i = 0; i < static_cast<int>(r_dim.size()); i++) {
+      r_dim[i] = 1;
+    }
+    for (int i = 0; i < static_cast<int>(dim.size()); i++) {
+      r_dim[dim[i]] = x_dim[dim[i]];
+    }
+
+    auto y_dim = y->dimensions();
+    auto x_max = x->maximum(dim);
+    y->device(place) =
+        (x_max +
+         (*x - x_max.reshape(t_dim).broadcast(r_dim)).exp().sum(dim).log())
+            .reshape(y_dim);
+  }
+};
+
+struct LogsumexpGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * (*x - y->broadcast(dim)).exp();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index fccf6d46895ff46c40d0a5c20d4cf1b614ad8a9e..fdb2c57385b2bc1068c618f206bfeb6513d3d8c4 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -103,11 +103,7 @@ REGISTER_OP_CPU_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          float, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int64_t, ops::MeanFunctor>);
+                                         double, ops::MeanFunctor>);
 
 template <typename T>
 using CPUReduceMeanGradKernel =
@@ -115,6 +111,4 @@ using CPUReduceMeanGradKernel =
                           ops::MeanGradFunctor, true>;
 
 REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<float>,
-                       CPUReduceMeanGradKernel<double>,
-                       CPUReduceMeanGradKernel<int>,
-                       CPUReduceMeanGradKernel<int64_t>);
+                       CPUReduceMeanGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
index 4d3bce8fdd05e536baa5fecb4fc5a117e2031224..cc3653fcb43a4c000d0c61c9d854965fafd59a9c 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
@@ -66,6 +66,4 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
+                        ops::ReduceMeanKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
index 12eceb33ec27298d60713e72c9cc2cf91a5e7cfb..289f574719ff03b1b09f313d05bab152f5c5d651 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
@@ -21,6 +21,4 @@ using CUDAReduceMeanGradKernel =
                           ops::MeanGradFunctor, true>;
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<float>,
-                        CUDAReduceMeanGradKernel<double>,
-                        CUDAReduceMeanGradKernel<int>,
-                        CUDAReduceMeanGradKernel<int64_t>);
+                        CUDAReduceMeanGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 4673dc258d062b219fb90f644265cbaa4cfb82ef..67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -98,6 +99,18 @@ class ReduceKernel : public framework::OpKernel<T> {
     int out_dtype = context.Attr<int>("out_dtype");
     framework::proto::VarType::Type cast_out_dtype;
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
+
     if (out_dtype < 0) {
       auto* cast_input = context.Input<Tensor>("X");
       cast_out_dtype =
@@ -137,6 +150,18 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
     auto dims = context.Attr<std::vector<int>>("dim");
     bool keep_dim = context.Attr<bool>("keep_dim");
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
       auto x = EigenVector<OutT>::Flatten(*input);
@@ -183,6 +208,17 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
     // NOTE: EigenTensor::From() uses tensor->data()
     // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
     // kNoNeedBufferY should set true
@@ -200,8 +236,8 @@ class ReduceGradKernel : public framework::OpKernel<T> {
 
     if (reduce_all) {
       auto x = EigenVector<T>::Flatten(*input0);
-      auto x_reduce = EigenVector<T>::From(*input1);
-      auto x_reduce_grad = EigenVector<T>::From(*input2);
+      auto x_reduce = EigenVector<T>::Flatten(*input1);
+      auto x_reduce_grad = EigenVector<T>::Flatten(*input2);
       auto x_grad = EigenVector<T>::Flatten(*output);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
@@ -298,6 +334,12 @@ class ReduceOp : public framework::OperatorWithKernel {
                             "range [-dimension(X), dimension(X)] "
                             "which dimesion = %d. But received dim index = %d.",
                             i, x_rank, dims[i]));
+      PADDLE_ENFORCE_GE(dims[i], -x_rank,
+                        platform::errors::InvalidArgument(
+                            "The reduce dim index %d should be in the "
+                            "range [-dimension(X), dimension(X)] "
+                            "which dimesion = %d. But received dim index = %d.",
+                            i, x_rank, dims[i]));
       if (dims[i] < 0) dims[i] = x_rank + dims[i];
     }
     sort(dims.begin(), dims.end());
diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h
index c0fbc336e46b64fc6ee43ef1a7372e413c5c3213..1c493fc6be093a2af8f58c8e78d1be43de34306f 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -29,6 +29,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/framework/variable.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+DECLARE_bool(use_mkldnn);
 
 namespace paddle {
 namespace operators {
@@ -262,6 +267,9 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     }
     VLOG(2) << "The number of sub scopes after forward: "
             << out_scope_vec->front()->kids().size();
+#ifdef PADDLE_WITH_MKLDNN
+    if (FLAGS_use_mkldnn) DontClearMKLDNNCache(ctx.GetPlace());
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
index 5ec32c98f7f84abb255ec996d0cf6a58e6312ec3..a09220b1ccd13604b6d842237c8176578967ac64 100644
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -21,6 +21,7 @@
 #include <sstream>
 #include <vector>
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -61,7 +62,9 @@ class SamplingIdKernel : public framework::OpKernel<T> {
 
     std::vector<int64_t> ids(batch_size);
     for (int i = 0; i < batch_size; ++i) {
-      T r = dist(engine);
+      T r = framework::Generator::GetInstance()->is_init_py
+                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
+                : dist(engine);
       int idx = width - 1;
       for (int j = 0; j < width; ++j) {
         if ((r -= ins_vector[i * width + j]) < 0) {
diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc
index 7c77b2688e7b528f678418c67e77fa4abff04248..0adf61d7ce3e5b5792b9dc65d5ac8f884dc81ea5 100644
--- a/paddle/fluid/operators/selu_op.cc
+++ b/paddle/fluid/operators/selu_op.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/selu_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
 namespace paddle {
 namespace operators {
 
@@ -28,11 +31,7 @@ class SeluOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "selu");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "selu");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
+    return UnaryOpUnchangedInferShape(ctx);
   }
 
  protected:
diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc
index 06eaca0216b36a50028fd7cfd3c0866a5b7c1de0..b45fa7c791ff22be422ce12a8348a071c60ddd0f 100644
--- a/paddle/fluid/operators/size_op.cc
+++ b/paddle/fluid/operators/size_op.cc
@@ -54,5 +54,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel<int>, ops::SizeKernel<int32_t>,
+                       ops::SizeKernel<paddle::platform::float16>,
                        ops::SizeKernel<float>, ops::SizeKernel<double>,
                        ops::SizeKernel<bool>);
diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu
index 4e5846660e62543638b669d586a92fc36b0c8e87..3ea3032693236d5618ff6f0c858cbd85e34633ab 100644
--- a/paddle/fluid/operators/size_op.cu
+++ b/paddle/fluid/operators/size_op.cu
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/size_op.h"
 
-REGISTER_OP_CUDA_KERNEL(size, paddle::operators::SizeKernel<int>,
-                        paddle::operators::SizeKernel<int32_t>,
-                        paddle::operators::SizeKernel<float>,
-                        paddle::operators::SizeKernel<bool>,
-                        paddle::operators::SizeKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    size, paddle::operators::SizeKernel<int>,
+    paddle::operators::SizeKernel<int32_t>,
+    paddle::operators::SizeKernel<paddle::platform::float16>,
+    paddle::operators::SizeKernel<float>, paddle::operators::SizeKernel<bool>,
+    paddle::operators::SizeKernel<double>);
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 8f5df7b6d5d3cb6cee6f08edaeaa4269c70be937..d147ec3e407b0382a0ed7311cfebbe49bf14134d 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -155,6 +155,10 @@ class SliceOp : public framework::OperatorWithKernel {
           in_tensor.IsInitialized(), true,
           platform::errors::InvalidArgument(
               "The tensor Input (Input) of Slice op is not initialized."));
+      // NOTE: cuda pinned tensor need to copy its data to target place
+      if (platform::is_cuda_pinned_place(in_tensor.place())) {
+        return framework::OpKernelType(in_tensor.type(), ctx.device_context());
+      }
       return framework::OpKernelType(in_tensor.type(), in_tensor.place());
     }
     return framework::OpKernelType(
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 7528422fdc09b7894898bdee94eaa11ad2cba311..f20bada8ab288fe74fd8ca82a73522a22b234191 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca8f6ce84fc571674fdfe6f29cbcd82a98fd8fcf
--- /dev/null
+++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+USE_OP(relu);
+USE_OP(elementwise_add);
+USE_OP(softmax);
+
+namespace paddle {
+namespace operators {
+namespace details {
+
+class DygraphInferShapeTest {
+ public:
+  void AddInput(const std::string& name, const framework::DDim& dim) {
+    std::shared_ptr<imperative::VarBase> vin(
+        new imperative::VarBase(false, name));
+    vin->MutableVar()->GetMutable<framework::LoDTensor>()->Resize(dim);
+    ins_[name] = {vin};
+  }
+  void AddOutput(const std::string& name, const framework::DDim& expected_dim) {
+    std::shared_ptr<imperative::VarBase> vout(
+        new imperative::VarBase(false, name));
+    vout->MutableVar()
+        ->GetMutable<framework::LoDTensor>();  // InitializeVariable
+    outs_[name] = {vout};
+    expected_dims_[name] = expected_dim;
+  }
+  void AddAttrs(const framework::AttributeMap& attrs) { attrs_ = attrs; }
+  void SetOpType(const std::string& op_type) { op_type_ = op_type; }
+  void Run(std::function<void(framework::InferShapeContext* ctx)> infer_shape) {
+    imperative::DygraphInferShapeContext<imperative::VarBase> ctx(
+        &ins_, &outs_, &attrs_, op_type_);
+    infer_shape(&ctx);
+    for (const auto& pair : expected_dims_) {
+      auto out = outs_[pair.first][0];
+      ASSERT_EQ(pair.second,
+                out->MutableVar()->GetMutable<framework::LoDTensor>()->dims());
+    }
+  }
+
+ private:
+  imperative::NameVarBaseMap ins_;
+  imperative::NameVarBaseMap outs_;
+  framework::AttributeMap attrs_;
+  std::string op_type_;
+  std::map<std::string, framework::DDim> expected_dims_;
+};
+}  // namespace details
+
+TEST(test_UnaryOpUnchangedInferShape, test_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.SetOpType("relu");
+  test.Run(UnaryOpUnchangedInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_same_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 3, 4, 5});
+  test.AddInput("Y", {2, 3, 4, 5});
+  test.AddOutput("Out", {2, 3, 4, 5});
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast1) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 3, 4, 5});
+  test.AddInput("Y", {4, 5});
+  test.AddOutput("Out", {2, 3, 4, 5});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast2) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10, 5, 1});
+  test.AddInput("Y", {10, 1, 1});
+  test.AddOutput("Out", {2, 10, 5, 1});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast3) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {10, 1, 1});
+  test.AddInput("Y", {2, 10, 5, 5});
+  test.AddOutput("Out", {2, 10, 5, 5});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_UnaryOpUnchangedInferShapeCheckAxis, test_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("softmax");
+  test.Run(UnaryOpUnchangedInferShapeCheckAxis);
+}
+
+TEST(test_UnaryOpUnchangedInferShapeCheckAxis, test_axis_exception) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.AddAttrs({
+      {"axis", 2},
+  });
+  test.SetOpType("softmax");
+  ASSERT_ANY_THROW(test.Run(UnaryOpUnchangedInferShapeCheckAxis));
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
index f416aa6e00f5a4a82c2562c36f9d32bb1a6843aa..cc2fe4cdbdb8faa69abad28fbdd31dc4e61bdc04 100644
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
+++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
@@ -41,12 +41,12 @@ static void InitRandom(framework::Tensor *tensor,
 
 template <typename T>
 struct LeakyReluGradGradEachElementFunctor {
-  LeakyReluGradGradEachElementFunctor(const T *ddx, const T *out, T alpha,
+  LeakyReluGradGradEachElementFunctor(const T *ddx, const T *x, T alpha,
                                       T *ddout)
-      : ddx_(ddx), out_(out), alpha_(alpha), ddout_(ddout) {}
+      : ddx_(ddx), x_(x), alpha_(alpha), ddout_(ddout) {}
 
   HOSTDEVICE void operator()(int idx) {
-    if (out_[idx] > 0) {
+    if (x_[idx] >= 0) {
       ddout_[idx] = ddx_[idx];
     } else {
       ddout_[idx] = ddx_[idx] * alpha_;
@@ -54,7 +54,7 @@ struct LeakyReluGradGradEachElementFunctor {
   }
 
   const T *ddx_;
-  const T *out_;
+  const T *x_;
   T alpha_;
   T *ddout_;
 };
@@ -66,13 +66,13 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   LeakyReluGradGradFunctor<T> functor;
   functor.alpha = alpha;
   auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place);
-  framework::Tensor *x = nullptr;
+  framework::Tensor *out = nullptr;
   framework::Tensor *dout = nullptr;
   framework::Tensor *dx = nullptr;
 
-  framework::Tensor out;
-  out.Resize(dim);
-  InitRandom<T>(&out, place);
+  framework::Tensor x;
+  x.Resize(dim);
+  InitRandom<T>(&x, place);
 
   framework::Tensor ddx;
   ddx.Resize(dim);
@@ -85,22 +85,22 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   framework::Tensor ddout_actual;
   ddout_actual.mutable_data<T>(dim, place);
   LeakyReluGradGradEachElementFunctor<T> actual_functor(
-      ddx.data<T>(), out.data<T>(), static_cast<T>(alpha),
+      ddx.data<T>(), x.data<T>(), static_cast<T>(alpha),
       ddout_actual.data<T>());
 
-  int64_t limit = out.numel();
+  int64_t limit = x.numel();
 
 #ifdef __NVCC__
   if (platform::is_gpu_place(place)) {
     auto &cuda_dev_ctx = dynamic_cast<platform::CUDADeviceContext &>(dev_ctx);
-    functor(cuda_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
+    functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx);
     platform::ForRange<platform::CUDADeviceContext> for_range(cuda_dev_ctx,
                                                               limit);
     for_range(actual_functor);
   } else {
 #endif
     auto &cpu_dev_ctx = dynamic_cast<platform::CPUDeviceContext &>(dev_ctx);
-    functor(cpu_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
+    functor(cpu_dev_ctx, &x, out, &ddx, &ddout, dout, dx);
     platform::ForRange<platform::CPUDeviceContext> for_range(cpu_dev_ctx,
                                                              limit);
     for_range(actual_functor);
diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da4ca87296d92fc1052f462ae6ee8a3acb05eb49
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.cc
@@ -0,0 +1,265 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/tile_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class TileOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile");
+    auto x_dims = ctx->GetInputDim("X");
+    auto repeat_times = ctx->Attrs().Get<std::vector<int>>("repeat_times");
+    if (repeat_times.size() == 0) {
+      repeat_times = std::vector<int>(x_dims.size(), -1);
+    }
+
+    PADDLE_ENFORCE_LE(
+        x_dims.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op "
+            "must not be greater than %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, x_dims.size()));
+    PADDLE_ENFORCE_LE(
+        repeat_times.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The size of the shape of input 'repeat_times' for tile op "
+            "must not be greater than %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, repeat_times.size()));
+    PADDLE_ENFORCE_GE(
+        repeat_times.size(), 1,
+        platform::errors::InvalidArgument(
+            "The size of the shape of input 'repeat_times' for tile op "
+            "must be positive integers, but the value received is %d.",
+            repeat_times.size()));
+
+    auto out_rank =
+        std::max(static_cast<size_t>(x_dims.size()), repeat_times.size());
+    std::vector<int64_t> out_shape(out_rank);
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    if (x_dim_vec.size() > repeat_times.size()) {
+      auto diff = x_dim_vec.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, -1);
+    } else {
+      auto diff = repeat_times.size() - x_dim_vec.size();
+      x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    }
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      if (x_dim_vec[i] == -1 || repeat_times[i] == -1) {
+        out_shape[i] = -1;
+      } else {
+        PADDLE_ENFORCE_GT(
+            repeat_times[i], 0,
+            platform::errors::InvalidArgument(
+                "Every element of the input 'repeat_times' for tile op must be "
+                "greater than 0, but the value given is %d.",
+                repeat_times[i]));
+        out_shape[i] = x_dim_vec[i] * repeat_times[i];
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class TileOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). X is the input to be titled.");
+    AddInput(
+        "RepeatTimes",
+        "(Tensor<int>, optional). If provided, it is the number of repeat times"
+        " along specific axis. It has a higher priority than "
+        "repeat_times_tensor and the repeat_times attribute.")
+        .AsDispensable();
+    AddInput("repeat_times_tensor",
+             "(Tensor Tensor<int>), repeat times for X."
+             "It has a higher priority than repeat_times, but a lower priority "
+             "than RepeatTimes")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "After tiling, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(repeat_times).");
+    AddAttr<std::vector<int>>("repeat_times",
+                              "The number of repeat times for each dimension.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Tile operator repeats the input by given times number. You should set times
+number for each dimension by providing attribute 'repeat_times'. The rank of X
+should be in [1, 6]. Please note that size of 'repeat_times' must be the same
+with X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(repeat_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class TileGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TileGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "TileGrad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> repeat_times =
+        ctx->Attrs().Get<std::vector<int>>("repeat_times");
+    if (repeat_times.size() == 0) {
+      repeat_times = std::vector<int>(x_dims.size(), -1);
+    }
+
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    if (x_dim_vec.size() > repeat_times.size()) {
+      auto diff = x_dim_vec.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, -1);
+    } else {
+      auto diff = repeat_times.size() - x_dim_vec.size();
+      x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    }
+
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      if (repeat_times[i] == -1 || x_dim_vec[i] == -1) {
+        continue;
+      } else {
+        if (ctx->IsRuntime()) {
+          PADDLE_ENFORCE_EQ(
+              x_dim_vec[i] * repeat_times[i], out_dims[i],
+              platform::errors::InvalidArgument(
+                  "The size (%d) of the dimension %d of Input(Out@GRAD) should "
+                  "be equal to the multiplication of the crroresponding "
+                  "dimension size of Input(X) (%d) and repeat_times (%d).",
+                  out_dims[i], i, x_dim_vec[i], repeat_times[i]));
+        }
+      }
+    }
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class TileGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("tile_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor"));
+    op->SetInput("RepeatTimes", this->Input("RepeatTimes"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker,
+                  ops::TileGradOpMaker<paddle::framework::OpDesc>,
+                  ops::TileGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(tile_grad, ops::TileGradOp,
+                  ops::TileGradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    tile, ops::TileKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    tile_grad, ops::TileGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tile_op.cu b/paddle/fluid/operators/tile_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ca82cd6a1f43551cb4d461bc47e962abd097a9a
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/tile_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    tile, ops::TileKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    tile_grad, ops::TileGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6b0fdd720cf4be79dc403a53341b18366998a67
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.h
@@ -0,0 +1,274 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define TILE_TEMPLATE(z, n, data) \
+  case n + 1: {                   \
+    Tile<n + 1>(context);         \
+    break;                        \
+  }
+#define REP_TILE_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define TILE_GRAD_CASE(n)                                        \
+  case n: {                                                      \
+    TileBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                       \
+  }
+#define TILE_GRAD_TEMPLATE(z, n, data) BOOST_PP_IF(COND(n), TILE_GRAD_CASE(n), )
+#define REP_TILE_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+inline std::vector<int> get_repeat_times(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    auto* repeat_tensor = ctx.Input<framework::LoDTensor>("RepeatTimes");
+    auto* repeat_data = repeat_tensor->data<int>();
+    framework::Tensor cpu_repeat_tensor;
+    if (platform::is_gpu_place(repeat_tensor->place())) {
+      TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor);
+      repeat_data = cpu_repeat_tensor.data<int>();
+    }
+    auto vec_repeat_times =
+        std::vector<int>(repeat_data, repeat_data + repeat_tensor->numel());
+    return vec_repeat_times;
+  }
+
+  auto list_repeat_times_tensor =
+      ctx.MultiInput<framework::Tensor>("repeat_times_tensor");
+  if (list_repeat_times_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_repeat_times;
+    for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
+      auto tensor = list_repeat_times_tensor[i];
+      if (platform::is_gpu_place(tensor->place())) {
+        framework::Tensor temp;
+        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_repeat_times.push_back(*temp.data<int32_t>());
+      } else {
+        vec_repeat_times.push_back(*tensor->data<int32_t>());
+      }
+    }
+    return vec_repeat_times;
+  } else {
+    return ctx.Attr<std::vector<int>>("repeat_times");
+  }
+}
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::To32BitIndex;
+
+template <typename DeviceContext, typename T>
+class TileKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1, platform::errors::InvalidArgument(
+                     "The rank of the input 'x' for tile op must be a positive "
+                     "integer, but the value received is %d.",
+                     rank));
+    PADDLE_ENFORCE_LE(
+        rank, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, rank));
+    auto repeat_times = get_repeat_times(context);
+    int repeat_times_size = repeat_times.size();
+    PADDLE_ENFORCE_GE(
+        repeat_times_size, 1,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile "
+            "op must be positive, but the value received is %d.",
+            repeat_times_size));
+    PADDLE_ENFORCE_LE(
+        repeat_times_size, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, repeat_times_size));
+    rank = std::max(rank, repeat_times_size);
+    switch (rank) { REP_TILE_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void Tile(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+
+    auto in_dims = in0->dims();
+    auto repeat_times = get_repeat_times(context);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      PADDLE_ENFORCE_GT(
+          repeat_times[i], 0,
+          platform::errors::InvalidArgument(
+              "All elements of the input 'repeat_times' for tile op must "
+              "be positive integers, but the value received is %d.",
+              repeat_times[i]));
+    }
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    if (repeat_times.size() < vec_in_dims.size()) {
+      int diff = vec_in_dims.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, 1);
+    } else {
+      int diff = repeat_times.size() - vec_in_dims.size();
+      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    }
+    PADDLE_ENFORCE_EQ(
+        repeat_times.size(), vec_in_dims.size(),
+        platform::errors::InvalidArgument(
+            "The rank (%d) of the input 'x' and the rank (%d) of the input "
+            "'repeat_times' for tile op must match after promotion.",
+            vec_in_dims.size(), repeat_times.size()));
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims(new_in_dims);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      out_dims[i] *= repeat_times[i];
+    }
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    // use 32-bit index to speed up
+    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
+    if (use_32bit_index) {
+      To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims);
+    } else {
+      y.device(place) = x.broadcast(bcast_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TileGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto repeat_times = get_repeat_times(context);
+    auto x_dims = in0->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    if (repeat_times.size() < vec_in_dims.size()) {
+      int diff = vec_in_dims.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, 1);
+    } else {
+      int diff = repeat_times.size() - vec_in_dims.size();
+      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    }
+    // 1. reshape_dims_vec is the broadcast parameter.
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "Th rank of the input 'Out@GRAD' for tile_grad op "
+                            " must be greater than or equal to 1, but "
+                            "the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for tile_grad op "
+                            "must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void TileBackward(const framework::ExecutionContext& context,
+                    const std::vector<int>& reshape_dims_vec,
+                    const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..57891699fd2ad73a1cccce26438528657afdf340
--- /dev/null
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -0,0 +1,515 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdio.h>
+#include <cstdio>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
+
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<paddle::platform::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
+                 paddle::platform::float16> {};
+}  // namespace cub
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+struct SegmentOffsetIter {
+  EIGEN_DEVICE_FUNC
+  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
+    return idx * num_cols_;
+  }
+
+  int num_cols_;
+};
+
+// Iter using into a column
+struct ColumnIndexIter {
+  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] % num_cols_;
+  }
+
+  int num_cols_;
+};
+
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+
+template <typename T>
+__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
+    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
+      indices[j * num_cols + i] = i;
+    }
+  }
+}
+
+template <typename T>
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
+
+  __device__ __forceinline__ void set(T value, int64_t id) {
+    v = value;
+    id = id;
+  }
+
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+
+  __device__ __forceinline__ bool operator>(const T value) const {
+    return (v > value);
+  }
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+
+  T v;
+  int64_t id;
+};
+
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size, const bool& largest) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (largest) {
+      if (topk[k] < p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    } else {
+      if (topk[k] > p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size,
+                                        const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size, const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp < max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp > max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
+                                              int beam_size, const T* src,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
+                                              const int tid, bool largest) {
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
+      GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
+        } else {
+          topk[k].set(-static_cast<T>(INFINITY), -1);
+        }
+      }
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
+                              length, largest);
+      }
+    }
+
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
+    *beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int64_t** topIds, int* beam, int* k,
+                                            const int tid, const int warp,
+                                            const bool& largest) {
+  while (true) {
+    __syncthreads();
+    if (tid < BlockSize / 2) {
+      if (largest) {
+        if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      } else {
+        if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      }
+    }
+    __syncthreads();
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (largest) {
+          if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        } else {
+          if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
+    __syncthreads();
+
+    if (tid == maxid[0]) {
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
+      }
+    }
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
+    if (maxid[0] / 32 == warp) {
+      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
+          MaxLength)
+        break;
+    }
+  }
+}
+
+/**
+ * Each block compute one sample.
+ * In a block:
+ * 1. every thread get top MaxLength value;
+ * 2. merge to sh_topk, block reduce and get max value;
+ * 3. go to the second setp, until one thread's topk value is null;
+ * 4. go to the first setp, until get the topk value.
+ */
+
+template <typename T, int MaxLength, int BlockSize>
+__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
+                             const T* src, int lds, int dim, int k,
+                             int grid_dim, int num, bool largest = true) {
+  __shared__ Pair<T> sh_topk[BlockSize];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+
+  const int bid = blockIdx.x;
+  for (int i = bid; i < num; i += grid_dim) {
+    int top_num = k;
+    __shared__ int maxid[BlockSize / 2];
+    T* out = output + i * output_stride;
+    int64_t* inds = indices + i * k;
+    Pair<T> topk[MaxLength];
+    int beam = MaxLength;
+    Pair<T> max;
+    bool is_empty = false;
+    bool firststep = true;
+
+    for (int j = 0; j < MaxLength; j++) {
+      if (largest) {
+        topk[j].set(-static_cast<T>(INFINITY), -1);
+      } else {
+        topk[j].set(static_cast<T>(INFINITY), -1);
+      }
+    }
+    while (top_num) {
+      ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k, src + i * lds,
+                                             &firststep, &is_empty, &max, dim,
+                                             tid, largest);
+
+      sh_topk[tid] = topk[0];
+      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
+                                           &beam, &top_num, tid, warp, largest);
+    }
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
+                           size_t rows, size_t cols, size_t k) {
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t j = 0; j < cols; ++j) {
+      x_grad[i * cols + j] = 0;
+    }
+    for (size_t j = 0; j < k; ++j) {
+      size_t idx = indices[i * k + j];
+      x_grad[i * cols + idx] = out_grad[i * k + j];
+    }
+  }
+}
+
+// the grad assign with the axis
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices,
+                                   T* grad_in, int pre, int post,
+                                   int raw_height, int k) {
+  // raw_height is the length of topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    const int& base_index = i * post * k;
+    const int& base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      const int64_t idx_ij = indices[base_index + j];
+      const int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[idx_ij];
+    }
+  }
+}
+// use the radix sort for the topk
+template <typename T>
+bool SortTopk(const platform::CUDADeviceContext& ctx,
+              const framework::Tensor* input_tensor, const int64_t num_cols,
+              const int64_t num_rows, const int k,
+              framework::Tensor* out_tensor, framework::Tensor* indices_tensor,
+              bool largest = true) {
+  auto cu_stream = ctx.stream();
+
+  Tensor input_indices;
+  const std::vector<int64_t> dims = {num_rows, num_cols};
+  auto dim = framework::make_ddim(dims);
+  input_indices.Resize(dim);
+  // input_indices.Resize(num_rows*num_cols);
+  input_indices.mutable_data<int64_t>(ctx.GetPlace());
+  size_t temp_storage_bytes = -1;
+
+  auto ComputeBlockSize = [](int col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256 && col <= 512)
+      return 512;
+    else if (col > 128 && col <= 256)
+      return 256;
+    else if (col > 64 && col <= 128)
+      return 128;
+    else
+      return 64;
+  };
+  int block_size = ComputeBlockSize(num_cols);
+
+  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  // actually, int num_rows < max_grid_size
+  unsigned int grid_size = num_rows < maxGridDimX
+                               ? static_cast<unsigned int>(num_rows)
+                               : maxGridDimX;
+  // Init a index array
+  InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
+      input_indices.data<int64_t>(), num_rows, num_cols);
+
+  // create iter for counting input
+  cub::CountingInputIterator<int64_t> counting_iter(0);
+  // segment_offset is used for move to next row
+  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
+                              cub::CountingInputIterator<int64_t>>
+      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
+
+  T* sorted_values_ptr;
+  int64_t* sorted_indices_ptr;
+
+  Tensor temp_values;
+  Tensor temp_indices;
+
+  const T* input = input_tensor->data<T>();
+  T* values = out_tensor->data<T>();
+  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
+
+  if (k == num_cols) {
+    // Doing a full sort.
+    sorted_values_ptr = values;
+    sorted_indices_ptr = indices;
+  } else {
+    temp_values.Resize(dim);
+    temp_indices.Resize(dim);
+    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
+    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
+  }
+
+  // Get temp storage buffer size, maybe can allocate a fixed buffer to save
+  // time.
+  if (largest) {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        nullptr, temp_storage_bytes, input, sorted_values_ptr,
+        input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
+        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
+        cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR)
+          << "TopKOP failed as could not launch "
+             "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
+             "temp_storage_bytes, status: "
+          << cudaGetErrorString(err);
+      return false;
+    }
+  } else {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairs(
+        nullptr, temp_storage_bytes, input, sorted_values_ptr,
+        input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
+        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
+        cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairs to calculate "
+                    "temp_storage_bytes, status: "
+                 << cudaGetErrorString(err);
+      return false;
+    }
+  }
+  Tensor temp_storage;
+  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
+
+  if (largest) {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        temp_storage.data<uint8_t>(), temp_storage_bytes, input,
+        sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
+        num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
+        0, sizeof(T) * 8, cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << cudaGetErrorString(err);
+      return false;
+    }
+  } else {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairs(
+        temp_storage.data<uint8_t>(), temp_storage_bytes, input,
+        sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
+        num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
+        0, sizeof(T) * 8, cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairs to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << cudaGetErrorString(err);
+      return false;
+    }
+  }
+  auto& dev = *ctx.eigen_device();
+  if (k < num_cols) {
+    // copy sliced data to output.
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
+    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
+    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+
+    std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
+    auto dim = framework::make_ddim(odims);
+    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
+    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+
+    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
+    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
+  }
+  return true;
+}
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 82ecc2887ba240560cf15165f21bc995f4683159..d8b2e92616091a8c822c6fd0bfdfb1148c25534d 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
 #include <cstdio>
+#include <vector>
 #include "cub/cub.cuh"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 // set cub base traits in order to handle float16
-namespace cub {
-template <>
-struct NumericTraits<paddle::platform::float16>
-    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
-                 paddle::platform::float16> {};
-}  // namespace cub
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T>
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
-
-  __device__ __forceinline__ void set(T value, int64_t id) {
-    v = value;
-    id = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair<T>& in) {
-    v = in.v;
-    id = in.id;
-  }
-
-  __device__ __forceinline__ bool operator<(const T value) const {
-    return (v < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
-    return (v < in.v) || ((v == in.v) && (id > in.id));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
-    return (v > in.v) || ((v == in.v) && (id < in.id));
-  }
-
-  T v;
-  int64_t id;
-};
-
-template <typename T>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
-                                      int beam_size) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int beam_size>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* src,
-                                              bool* firstStep, bool* is_empty,
-                                              Pair<T>* max, int dim,
-                                              const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - (*beam)) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* val,
-                                              int* col, bool* firstStep,
-                                              bool* is_empty, Pair<T>* max,
-                                              int dim, const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - *beam) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
-                                            Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int* beam, int* k,
-                                            const int tid, const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < BlockSize / 2) {
-      if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
-        maxid[tid] = tid + BlockSize / 2;
-      } else {
-        maxid[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
-          maxid[tid] = maxid[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = sh_topk[maxid[0]].v;
-      **topIds = sh_topk[maxid[0]].id;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxid[0]) (*beam)++;
-    if (--(*k) == 0) break;
-    __syncthreads();
-
-    if (tid == maxid[0]) {
-      if (*beam < MaxLength) {
-        sh_topk[tid] = topk[*beam];
-      }
-    }
-    // NOTE(zcd): temporary solution
-    unsigned mask = 0u;
-    CREATE_SHFL_MASK(mask, true);
-
-    if (maxid[0] / 32 == warp) {
-      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
-          MaxLength)
-        break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top MaxLength value;
- * 2. merge to sh_topk, block reduce and get max value;
- * 3. go to the second setp, until one thread's topk value is null;
- * 4. go to the first setp, until get the topk value.
- */
-
-template <typename T, int MaxLength, int BlockSize>
-__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k,
-                             int grid_dim, int num) {
-  __shared__ Pair<T> sh_topk[BlockSize];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-
-  const int bid = blockIdx.x;
-  for (int i = bid; i < num; i += grid_dim) {
-    int top_num = k;
-    __shared__ int maxid[BlockSize / 2];
-    T* out = output + i * output_stride;
-    int64_t* inds = indices + i * k;
-    Pair<T> topk[MaxLength];
-    int beam = MaxLength;
-    Pair<T> max;
-    bool is_empty = false;
-    bool firststep = true;
-
-    for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-static_cast<T>(INFINITY), -1);
-    }
-    while (top_num) {
-      ThreadGetTopK<T, MaxLength, BlockSize>(
-          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
-
-      sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
-                                           &beam, &top_num, tid, warp);
-    }
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
-                           size_t rows, size_t cols, size_t k) {
-  for (size_t i = 0; i < rows; ++i) {
-    for (size_t j = 0; j < cols; ++j) {
-      x_grad[i * cols + j] = 0;
-    }
-    for (size_t j = 0; j < k; ++j) {
-      size_t idx = indices[i * k + j];
-      x_grad[i * cols + idx] = out_grad[i * k + j];
-    }
-  }
-}
-
-inline static int GetDesiredBlockDim(int dim) {
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
-// Iter for move to next row
-struct SegmentOffsetIter {
-  EIGEN_DEVICE_FUNC
-  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
-    return idx * num_cols_;
-  }
-
-  int num_cols_;
-};
-
-// Iter using into a column
-struct ColumnIndexIter {
-  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
-      const Eigen::array<int, 1>& ix) const {
-    return ix[0] % num_cols_;
-  }
-
-  int num_cols_;
-};
-
-__global__ void InitIndex(int64_t* indices, int64_t num_rows,
-                          int64_t num_cols) {
-  int col_id = threadIdx.x;
-  int row_id = blockIdx.x;
-
-  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
-    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
-      indices[j * num_cols + i] = i;
-    }
-  }
-}
-
-template <typename T>
-bool SortTopk(const platform::CUDADeviceContext& ctx,
-              const framework::Tensor* input_tensor, const int64_t num_cols,
-              const int64_t num_rows, const int k,
-              framework::Tensor* out_tensor,
-              framework::Tensor* indices_tensor) {
-  auto cu_stream = ctx.stream();
-
-  Tensor input_indices;
-  const std::vector<int64_t> dims = {num_rows, num_cols};
-  auto dim = framework::make_ddim(dims);
-  input_indices.Resize(dim);
-  // input_indices.Resize(num_rows*num_cols);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
-  size_t temp_storage_bytes = -1;
-
-  auto ComputeBlockSize = [](int col) {
-    if (col > 512)
-      return 1024;
-    else if (col > 256 && col <= 512)
-      return 512;
-    else if (col > 128 && col <= 256)
-      return 256;
-    else if (col > 64 && col <= 128)
-      return 128;
-    else
-      return 64;
-  };
-
-  int block_size = ComputeBlockSize(num_cols);
-
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
-  // actually, int num_rows < max_grid_size
-  unsigned int grid_size = num_rows < maxGridDimX
-                               ? static_cast<unsigned int>(num_rows)
-                               : maxGridDimX;
-  // Init a index array
-  InitIndex<<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<int64_t>(), num_rows, num_cols);
-
-  // create iter for counting input
-  cub::CountingInputIterator<int64_t> counting_iter(0);
-  // segment_offset is used for move to next row
-  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
-                              cub::CountingInputIterator<int64_t>>
-      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-
-  T* sorted_values_ptr;
-  int64_t* sorted_indices_ptr;
-
-  Tensor temp_values;
-  Tensor temp_indices;
-
-  const T* input = input_tensor->data<T>();
-  T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
-
-  if (k == num_cols) {
-    // Doing a full sort.
-    sorted_values_ptr = values;
-    sorted_indices_ptr = indices;
-  } else {
-    temp_values.Resize(dim);
-    temp_indices.Resize(dim);
-    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
-  }
-
-  // Get temp storage buffer size, maybe can allocate a fixed buffer to save
-  // time.
-  auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      nullptr, temp_storage_bytes, input, sorted_values_ptr,
-      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
-      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-      cu_stream);
-  if (err != cudaSuccess) {
-    LOG(ERROR)
-        << "TopKOP failed as could not launch "
-           "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
-           "temp_storage_bytes, status: "
-        << cudaGetErrorString(err);
-    return false;
-  }
-  Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
-
-  err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      temp_storage.data<uint8_t>(), temp_storage_bytes, input,
-      sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
-      num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
-      0, sizeof(T) * 8, cu_stream);
-  if (err != cudaSuccess) {
-    LOG(ERROR)
-        << "TopKOP failed as could not launch "
-           "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-           "temp_storage_bytes: "
-        << temp_storage_bytes << ", status: " << cudaGetErrorString(err);
-    return false;
-  }
-  auto& dev = *ctx.eigen_device();
-  if (k < num_cols) {
-    // copy sliced data to output.
-    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
-    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
-    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
-
-    std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
-    auto dim = framework::make_ddim(odims);
-    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
-
-    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
-    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
-  }
-  return true;
-}
-
 #define FIXED_BLOCK_DIM_BASE(dim, ...) \
   case (dim): {                        \
     constexpr auto kBlockDim = (dim);  \
@@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
         framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
     const int64_t input_width = inputdims[inputdims.size() - 1];
     const auto& dev_ctx = ctx.cuda_device_context();
-
     if ((input_width <= 1024 || k >= 128 || k == input_width)) {
       if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
                       indices)) {
@@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
         framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
     const size_t col = xdims[xdims.size() - 1];
     const auto& dev_ctx = context.cuda_device_context();
-
     const int kMaxHeight = 2048;
     int gridx = row < kMaxHeight ? row : kMaxHeight;
     switch (GetDesiredBlockDim(col)) {
@@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
 
 }  // namespace operators
 }  // namespace paddle
-
 REGISTER_OP_CUDA_KERNEL(
     top_k,
     paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc72d83411f5a34561a75e7e75f98077ee5a4e5d
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/top_k_v2_op.h"
+#include <memory>
+
+namespace paddle {
+namespace operators {
+
+class TopkV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
+                   "Output(Indices) of TopkOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("X");
+    const int& dim_size = input_dims.size();
+    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
+    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
+    PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
+                      "the axis of topk"
+                      "must be [-%d, %d), but you set axis is %d",
+                      dim_size, dim_size, axis);
+
+    if (axis < 0) axis += dim_size;
+
+    PADDLE_ENFORCE_GE(
+        k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
+    PADDLE_ENFORCE_GE(input_dims.size(), 1,
+                      "input of topk must have >= 1d shape");
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GE(
+          input_dims[axis], k,
+          "input of topk op must have >= %d columns in axis of %d", k, axis);
+    }
+
+    framework::DDim dims = input_dims;
+
+    dims[axis] = k;
+    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Indices", dims);
+    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("X", "Indices");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(),
+        layout_, library_);
+  }
+};
+
+class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input of Topk op");
+    AddInput("K",
+             "(Tensor)  Number of top elements to look for along "
+             "the last dimension (along each row for matrices).")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
+    AddComment(R"DOC(
+Top K operator
+
+If the input is a vector (1d tensor), this operator finds the k largest 
+entries in the vector and outputs their values and indices as vectors. 
+Thus values[j] is the j-th largest entry in input, and its index is indices[j].
+
+For matrices, this operator computes the top k entries in each row. )DOC");
+    AddAttr<int>("k",
+                 "(int, default 1) Number of top elements to look for along "
+                 "the tensor).")
+        .SetDefault(1);
+    AddAttr<int>("axis",
+                 "the axis to sort and get the k indices, value."
+                 "if not set, will get k value in last axis.")
+        .SetDefault(-1);
+    AddAttr<bool>("largest",
+                  "control flag whether to return largest or smallest")
+        .SetDefault(true);
+    AddAttr<bool>("sorted",
+                  "control flag whether to return elements in sorted order")
+        .SetDefault(true);
+  }
+};
+
+class TopkV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        platform::errors::InvalidArgument("Input(X) should be not null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Indices"), true,
+        platform::errors::InvalidArgument("Input(Indices) should be not null"));
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
+                      platform::errors::InvalidArgument(
+                          "Grad Input(Out) should be not null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput(framework::GradVarName("X")), true,
+        platform::errors::InvalidArgument("Grad Output(X) should be not null"));
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+template <typename T>
+class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("top_k_v2_grad");
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Indices", this->Output("Indices"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
+                  ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(top_k_v2,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, float>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, double>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int32_t>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int64_t>)
+
+REGISTER_OP_CPU_KERNEL(
+    top_k_v2_grad, ops::TopkV2GradKernel<paddle::platform::CPUPlace, float>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, double>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int32_t>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int64_t>)
diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c94dca1e3a461a44b98e9acf604cc4b488b5fd7
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.cu
@@ -0,0 +1,271 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/fluid/operators/top_k_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+
+template <typename DeviceContext, typename T>
+class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+
+    // get the attributes
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+    int axis = static_cast<int>(ctx.Attr<int>("axis"));
+    const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
+    const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
+
+    // get the input dims
+    const auto& in_dims = input->dims();
+    // calcluate the real axis
+    if (axis < 0) axis += in_dims.size();
+
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      Tensor k_host;
+      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
+      k = k_host.data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    const auto& out_dims = output->dims();
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    if (axis == in_dims.size() - 1) {
+      // if get the topK from the last axis
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      const auto& dev_ctx = ctx.cuda_device_context();
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
+                        indices, largest)) {
+          // Successed, return.
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      // NOTE: pass lds and dim same to input width.
+      // NOTE: old matrix implementation of stride is different to eigen.
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 5,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                output_data, k, indices_data, input_data, input_width,
+                input_width, static_cast<int>(k), gridx, input_height,
+                largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape has error in the topk cuda kernel."));
+      }
+    } else {
+      // if get topK not from the last axis, will tranpose the tensor and get
+      // TopK
+
+      // first step, prepare the trans args for the tranpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (int i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+      // second step, tranpose the input
+      Tensor trans_input;
+      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
+      int ndims = trans.size();
+      const auto& dev_ctx = ctx.cuda_device_context();
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
+                                                   &trans_input, trans);
+      // third step, calcluate the topk
+      // allocate the tmp cuda memory for the tmp result
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
+      Tensor trans_out;
+      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
+                        &trans_out, &trans_ind, largest)) {
+          // last step, tranpose back the indices and output
+          TransCompute<platform::CUDADeviceContext, int64_t>(
+              ndims, dev_ctx, trans_ind, indices, trans);
+          TransCompute<platform::CUDADeviceContext, T>(
+              ndims, dev_ctx, trans_out, output, trans);
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 5,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
+                trans_input.data<T>(), input_width, input_width,
+                static_cast<int>(k), gridx, input_height, largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape has error in the topk cuda kernel."));
+      }
+
+      // last step, tranpose back the indices and output
+      TransCompute<platform::CUDADeviceContext, int64_t>(
+          ndims, dev_ctx, trans_ind, indices, trans);
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
+                                                   output, trans);
+    }
+  }
+};
+
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+template <typename DeviceContext, typename T>
+class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = context.Attr<int>("axis");
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // get the real the axis and the k
+    if (axis < 0) axis += in_dims.size();
+    const int& k = out_dims[axis];
+    const int& raw_height = in_dims[axis];
+
+    // allocate the cuda memory for the x_grad
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const int64_t* indices_data = indices->data<int64_t>();
+
+    int pre, n, post;
+    GetDims(in_dims, axis, &pre, &n, &post);
+
+    // calcluate the block and grid num
+    auto& dev_ctx = context.cuda_device_context();
+    auto ComputeBlockSize = [](int col) {
+      if (col > 512)
+        return 1024;
+      else if (col > 256 && col <= 512)
+        return 512;
+      else if (col > 128 && col <= 256)
+        return 256;
+      else if (col > 64 && col <= 128)
+        return 128;
+      else
+        return 64;
+    };
+    int block_size = ComputeBlockSize(post * k);
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+    int grid_size = std::min(max_blocks, pre);
+
+    // lanuch the cuda kernel to assign the grad
+    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+        out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          float>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          double>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
+                       paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int64_t>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..89b5d36b1b3f915e8719c8791e8c12c2e0348f26
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.h
@@ -0,0 +1,334 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+  The reason why we need the topk v2 is because the compatibility. We redefine
+  the NaN is maximum value
+  in the process of comparing. If do not add the topk v2,  will affect the
+  inference result of model that traing
+  by the older version paddlepaddle.
+*/
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/operators/transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
+template <typename T, typename Type>
+static void FullTopK(Type input_height, Type input_width, int input_dim,
+                     const framework::Tensor* input, T* t_out, Type* t_indices,
+                     const int& k, const bool& largest, const bool& sorted) {
+  // when the k is small, will the partial sort
+  bool partial_sort_flag = (k * 64) < input_width;
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    if (partial_sort_flag) {
+      std::partial_sort(
+          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
+          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+            if (largest) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            } else {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            }
+          });
+    } else {
+      // use the nth-element to get the K-larger or K-small element
+      if (largest) {
+        std::nth_element(
+            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
+                    [&largest](const std::pair<T, Type>& l,
+                               const std::pair<T, Type>& r) {
+                      return (std::isnan(static_cast<double>(l.first)) &&
+                              !std::isnan(static_cast<double>(r.first))) ||
+                             (l.first > r.first);
+                    });
+        }
+      } else {
+        std::nth_element(
+            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(
+              col_vec.begin(), col_vec.begin() + k - 1,
+              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                return (!std::isnan(static_cast<double>(l.first)) &&
+                        std::isnan(static_cast<double>(r.first))) ||
+                       (l.first < r.first);
+              });
+        }
+      }
+    }
+    for (Type j = 0; j < k; ++j) {
+      t_out[i * k + j] = col_vec[j].first;
+      t_indices[i * k + j] = col_vec[j].second;
+    }
+  }
+}
+
+template <typename T, typename Type>
+static void FullTopKAssign(const Type& input_height, const Type& input_width,
+                           const int& input_dim, const framework::Tensor* input,
+                           const framework::Tensor* indices, T* output_data,
+                           const int& k) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(j)] = e_input(j);
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
+      }
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class TopkV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // Get the top k elements of each row of input tensor
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    auto* indices = context.Output<Tensor>("Indices");
+    const auto& in_dims = input->dims();
+    int k = static_cast<int>(context.Attr<int>("k"));
+    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
+    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
+
+    // axis < 0, cacluate the real axis
+    int axis = static_cast<int>(context.Attr<int>("axis"));
+    if (axis < 0) axis += in_dims.size();
+
+    // if K tensor is not null, will the use K tesnor as k
+    auto* k_t = context.Input<Tensor>("K");
+    if (k_t) {
+      k = k_t->data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      // accroding to axis to set K value in the dim
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
+    const auto& out_dims = output->dims();
+    if (axis + 1 == in_dims.size()) {
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
+                           output_data, indices_data, k, largest, sorted);
+    } else {
+      // if the topk dims is not last dim, will tranpose and do topk
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.push_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      // get the trans input_dims, out_dims
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+      }
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+
+      Tensor trans_inp;
+      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
+      int ndims = trans.size();
+      auto& dev_context =
+          context.template device_context<platform::CPUDeviceContext>();
+
+      // transpose the input value
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
+                                                  &trans_inp, trans);
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      // Allocate the temp tensor to the save the topk indices, values
+      Tensor tmp_out;
+      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
+      Tensor tmp_indices;
+      auto* t_ind =
+          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
+
+      // get the TopK value
+      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
+                           &trans_inp, t_out, t_ind, k, largest, sorted);
+      // transpose back
+      TransCompute<platform::CPUDeviceContext, int64_t>(
+          ndims, dev_context, tmp_indices, indices, trans);
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
+                                                  output, trans);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TopkV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = static_cast<int>(context.Attr<int>("axis"));
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // axis < 0, get the real axis
+    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+    const size_t& k = out_dims[axis];
+
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    if (axis + 1 == in_dims.size()) {
+      // allocate the memory for the input_grad
+
+      // assign the out_grad to input_grad directly
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t input_width = in_dims[in_dims.size() - 1];
+
+      // init the output grad with 0, because some input elements has no grad
+      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+      // Assign the output_grad to input_grad
+      FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
+                     indices, x_grad_data, k);
+    } else {
+      // can not assign grad to input_grad, must do the transpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(out_dims.size() - 1);
+      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+      framework::DDim trans_dims(out_dims);
+      framework::DDim trans_in_dims(in_dims);
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_dims[i] = out_dims[trans[i]];
+        trans_in_dims[i] = in_dims[trans[i]];
+      }
+      // transpose the out_grad, indices
+      Tensor trans_dO;
+      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
+      int ndims = trans.size();
+      auto& dev_context =
+          context.template device_context<platform::CPUDeviceContext>();
+
+      // Do transpose
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *out_grad,
+                                                  &trans_dO, trans);
+      TransCompute<platform::CPUDeviceContext, int64_t>(
+          ndims, dev_context, *indices, &trans_ind, trans);
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
+      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
+
+      // Assign the out_grad to tranpose input_grad
+      Tensor tmp_out;
+      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
+      memset(t_out, 0, x_grad->numel() * sizeof(T));
+
+      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
+                                 &trans_dO, &trans_ind, t_out, k);
+
+      // Transpose back
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
+                                                  x_grad, trans);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index 9e158abba747d124c83e0366b9c0c5845c49e183..3aa9ff544af63993521d41604cecef0b283ebc1e 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <limits>
 #include <random>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -161,18 +162,27 @@ class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
     std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
                                            1.0);
     TruncatedNormal<T> truncated_normal(mean, std);
     int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = truncated_normal(dist(engine));
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = truncated_normal(dist(gen_engine));
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = truncated_normal(dist(engine));
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index e0c56307639afeb70e5cc45a4022996cef52a475..a4487cde277990a725fd4c37b6d807278e314343 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/uniform_random_op.h"
 #include <string>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+
 namespace paddle {
 namespace operators {
 
@@ -55,19 +57,40 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
           "supports SelectedRows and LoDTensor");
     }
     T *data = tensor->mutable_data<T>(ctx.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
+
+    int64_t size = tensor->numel();
     std::uniform_real_distribution<T> dist(
         static_cast<T>(ctx.Attr<float>("min")),
         static_cast<T>(ctx.Attr<float>("max")));
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    auto gen_ptr = framework::Generator::GetInstance();
+    if (gen_ptr->is_init_py) {
+      std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+      // auto gen_engine = gen_ptr_->GetCPUEngine();
+      // std::uniform_real_distribution<T> dist(
+      //    static_cast<T>(ctx.Attr<float>("min")),
+      //    static_cast<T>(ctx.Attr<float>("max")));
+
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      // std::uniform_real_distribution<T> dist(
+      //    static_cast<T>(ctx.Attr<float>("min")),
+      //    static_cast<T>(ctx.Attr<float>("max")));
+      // int64_t size = tensor->numel();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
+    // std::mt19937_64 &engine = gen_ptr->GetCPUEngine();
+    // auto engine = gen_ptr_->GetCPUEngine();
+
     unsigned int diag_num =
         static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
     unsigned int diag_step =
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 53c79cf672e7d71ea2e7202f624a0110cc6ce41d..c024bb87b09c00c34dbaaf7b747f29743152502f 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
@@ -87,9 +88,14 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
+    if (framework::Generator::GetInstance()->is_init_py) {
+      seed = static_cast<unsigned int>(
+          framework::Generator::GetInstance()->GetCurrentSeed());
+    } else {
+      if (seed == 0) {
+        std::random_device rd;
+        seed = rd();
+      }
     }
     T min = static_cast<T>(context.Attr<float>("min"));
     T max = static_cast<T>(context.Attr<float>("max"));
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index 867b10441640c63fec9018363a59d29ac52c8743..d263dd03dd0de0d1b12925d0c3ec428b6730ef2e 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
index c141033b2b3e6b9fdeac88610dd1362ba8f98428..745102dd28d3d578ec3674221645fc1e8bdfe43a 100644
--- a/paddle/fluid/operators/unique_op.cc
+++ b/paddle/fluid/operators/unique_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unique_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -24,17 +25,63 @@ class UniqueOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
-    OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
-
     auto in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), 1,
-        platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
-                                          "But now the dims of Input(X) is %d.",
-                                          in_dims.size()));
+    if (!ctx->Attrs().Get<bool>("is_sorted")) {
+      OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
+      PADDLE_ENFORCE_EQ(in_dims.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "The Input(X) should be 1-D Tensor, "
+                            "But now the dims of Input(X) is %d.",
+                            in_dims.size()));
+
+      ctx->SetOutputDim("Out", {-1});
+      ctx->SetOutputDim("Index", in_dims);
+      return;
+    }
+
+    bool return_index = ctx->Attrs().Get<bool>("return_index");
+    bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
+    bool return_counts = ctx->Attrs().Get<bool>("return_counts");
+    auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
+
+    if (return_index) {
+      OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
+    }
+    if (return_inverse) {
+      OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
+    }
+    if (return_counts) {
+      OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
+    }
 
-    ctx->SetOutputDim("Out", {-1});
-    ctx->SetOutputDim("Index", in_dims);
+    if (axis_vec.empty()) {
+      ctx->SetOutputDim("Out", {-1});
+      if (return_inverse) {
+        ctx->SetOutputDim("Index", {framework::product(in_dims)});
+      }
+    } else {
+      int axis = axis_vec[0];
+      if (axis < 0) {
+        axis += in_dims.size();
+      }
+      PADDLE_ENFORCE_LT(
+          axis, in_dims.size(),
+          platform::errors::InvalidArgument("The axis(%d) should be less than "
+                                            "the dimension size(%d) of x.",
+                                            axis, in_dims.size()));
+      auto out_dims = in_dims;
+      out_dims[axis] = -1;
+      ctx->SetOutputDim("Out", out_dims);
+      if (return_inverse) {
+        ctx->SetOutputDim("Index", {in_dims[axis]});
+      }
+    }
+    if (return_index) {
+      ctx->SetOutputDim("Indices", {-1});
+    }
+    if (return_counts) {
+      ctx->SetOutputDim("Counts", {-1});
+    }
   }
 
  protected:
@@ -49,14 +96,47 @@ class UniqueOp : public framework::OperatorWithKernel {
 class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "Input tensor. It should be a 1-D tensor.");
+    AddInput("X",
+             "Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
+             " is fasle or a N-D tensor when Attr(is_sorted) is true.");
     AddAttr<int>("dtype", "data type for output index");
     AddOutput("Out", "A unique subsequence for input tensor.");
     AddOutput("Index",
-              "An index tensor pointing to unique subsequence, which has "
-              "identical shape with input tensor and int64 dtype.");
+              "Equivalent to inverse in numpy.unique, "
+              "the indices for where elements in the original input ended up "
+              "in the returned unique tensor.");
+    AddOutput(
+        "Indices",
+        "The indices of the input tensor that result in the unique tensor.")
+        .AsDispensable();
+    AddOutput("Counts", "The counts for each unique element.").AsDispensable();
+    AddAttr<bool>("return_index",
+                  "If True, also return the indices of the input"
+                  " tensor that result in the unique Tensor.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "return_inverse",
+        "If True, also return the indices for where elements"
+        " in the original input ended up in the returned unique tensor.")
+        .SetDefault(false);
+    AddAttr<bool>("return_counts",
+                  "If True, also return the counts for each unique element.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "axis",
+        "The axis to apply unique. If None, the input will be flattened.")
+        .SetDefault({});
+    AddAttr<bool>("is_sorted",
+                  "If True, the unique elements of X are in ascending order."
+                  "Otherwise, the unique elements are not sorted.")
+        .SetDefault(false);
     AddComment(R"DOC(
-    Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
+    1. Return a unique subsequence for 1-D input tensor, and an index tensor
+    pointing to this unique subsequence when Attr(is_sorted) is false. This 
+    means paddle.unique is called.
+    
+    2. Returns the unique elements of X in ascending order when Attr(is_sorted)
+    is true. This means fluid.layers.unique is called.
 )DOC");
   }
 };
@@ -65,6 +145,39 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
-REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
-                       ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
-                       ops::UniqueKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_VERSION(unique)
+    .AddCheckpoint(
+        R"ROC(
+        Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute
+        [return_index, return_inverse, return_counts, axis, is_sorted].
+      )ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewOutput("Indices",
+                       "The indices of the input tensor that result in the "
+                       "unique tensor.")
+            .NewOutput("Counts", "The counts for each unique element.")
+            .NewAttr("return_index",
+                     "If True, also return the indices of the input"
+                     " tensor that result in the unique Tensor.",
+                     false)
+            .NewAttr("return_inverse",
+                     "If True, also return the indices for where elements"
+                     " in the original input ended up in the returned unique "
+                     "tensor.",
+                     false)
+            .NewAttr("return_counts",
+                     "If True, also return the counts for each unique element.",
+                     false)
+            .NewAttr("axis",
+                     "The axis to apply unique. If None, the input will be "
+                     "flattened.",
+                     {})
+            .NewAttr("is_sorted",
+                     "If True, the unique elements of X are in ascending order."
+                     "Otherwise, the unique elements are not sorted.",
+                     false));
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
index cdfd797cbfdf87a42ce0834eea9467010a058431..2bd2a2cbf34c6ccba1e6bfd1892f0f821d0f7c72 100644
--- a/paddle/fluid/operators/unique_op.h
+++ b/paddle/fluid/operators/unique_op.h
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include <cmath>
+#include <numeric>
+#include <set>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
@@ -104,17 +109,313 @@ struct UniqueOpFunctor {
   }
 };
 
+static std::vector<framework::Tensor> Unbind(const framework::Tensor& in) {
+  int64_t size = in.dims()[0];
+  std::vector<framework::Tensor> tensors(size);
+  for (int64_t i = 0; i < size; ++i) {
+    tensors[i] = in.Slice(i, i + 1);
+  }
+  return tensors;
+}
+
 template <typename T>
+static bool Equal(const framework::Tensor& a, const framework::Tensor& b) {
+  if (a.numel() != b.numel()) {
+    return false;
+  }
+  for (int64_t i = 0; i < a.numel(); ++i) {
+    if (a.data<T>()[i] != b.data<T>()[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename InT, typename IndexT>
+static void UniqueFlattendTensor(const framework::ExecutionContext& context,
+                                 const framework::Tensor& in,
+                                 framework::Tensor* out, bool return_index,
+                                 bool return_inverse, bool return_counts) {
+  const InT* in_data = in.data<InT>();
+  std::set<InT> unique(in_data, in_data + in.numel());
+  out->Resize(framework::make_ddim({static_cast<int64_t>(unique.size())}));
+  auto out_data = out->mutable_data<InT>(context.GetPlace());
+  std::copy(unique.begin(), unique.end(), out_data);
+
+  if (return_index) {
+    auto* indices = context.Output<framework::Tensor>("Indices");
+    indices->Resize(framework::make_ddim({out->numel()}));
+    auto indices_data = indices->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> indices_map;
+    indices_map.reserve(out->numel());
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      if (indices_map.find(in_data[i]) != indices_map.end()) continue;
+      indices_map[in_data[i]] = i;
+    }
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      indices_data[i] = indices_map[out_data[i]];
+    }
+  }
+
+  if (return_inverse) {
+    auto* inverse = context.Output<framework::Tensor>("Index");
+    inverse->Resize(framework::make_ddim({in.numel()}));
+    auto inverse_data = inverse->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> inverse_map;
+    inverse_map.reserve(out->numel());
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      inverse_map[out_data[i]] = i;
+    }
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      inverse_data[i] = inverse_map[in_data[i]];
+    }
+  }
+
+  if (return_counts) {
+    auto* count = context.Output<framework::Tensor>("Counts");
+    count->Resize(framework::make_ddim({out->numel()}));
+    auto count_data = count->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> counts_map;
+    counts_map.reserve(out->numel());
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      counts_map[out_data[i]] = 0;
+    }
+    for (int64_t i = 0; i < in.numel(); i++) {
+      counts_map[in_data[i]] += 1;
+    }
+    for (int64_t i = 0; i < out->numel(); i++) {
+      count_data[i] = counts_map[out_data[i]];
+    }
+  }
+}
+
+template <class ForwardIt, typename InT, typename IndexT>
+static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
+                               ForwardIt first, ForwardIt last,
+                               const std::vector<IndexT>& sorted_indices_vec,
+                               std::vector<IndexT>* inverse_vec,
+                               std::vector<IndexT>* counts_vec,
+                               std::vector<IndexT>* indices_vec) {
+  if (first == last) {
+    return last;
+  }
+
+  (*inverse_vec)[sorted_indices_vec[0]] = 0;
+  (*counts_vec)[0] = 1;
+  (*indices_vec)[0] = sorted_indices_vec[0];
+
+  ForwardIt begin = first;
+  ForwardIt result = first;
+
+  while (++first != last) {
+    int64_t idx_first = std::distance(begin, first);
+    int64_t idx_result = std::distance(begin, result);
+    if (!Equal<InT>(*result, *first)) {
+      if (++result != first) {
+        *result = std::move(*first);
+      }
+      idx_result += 1;
+      (*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
+    }
+    (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
+    (*counts_vec)[idx_result] += 1;
+  }
+  return ++result;
+}
+
+template <typename DeviceContext, typename InT, typename IndexT>
+static void UniqueDim(const framework::ExecutionContext& context,
+                      const framework::Tensor& in, framework::Tensor* out,
+                      bool return_index, bool return_inverse,
+                      bool return_counts, int axis) {
+  // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
+  std::vector<int> permute(in.dims().size());
+  std::iota(permute.begin(), permute.end(), 0);
+  permute[axis] = 0;
+  permute[0] = axis;
+  std::vector<int64_t> in_trans_dims_vec(framework::vectorize(in.dims()));
+  in_trans_dims_vec[axis] = in.dims()[0];
+  in_trans_dims_vec[0] = in.dims()[axis];
+  framework::Tensor in_trans;
+  framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec);
+  in_trans.Resize(in_trans_dims);
+  in_trans.mutable_data<InT>(context.GetPlace());
+  auto& dev_ctx = context.template device_context<DeviceContext>();
+  TransCompute<DeviceContext, InT>(in.dims().size(), dev_ctx, in, &in_trans,
+                                   permute);
+  // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
+  framework::DDim in_trans_flat_dims =
+      framework::flatten_to_2d(in_trans_dims, 1);
+  in_trans.Resize(in_trans_flat_dims);
+
+  // sort indices
+  std::vector<IndexT> sorted_indices_vec(in_trans.dims()[0]);
+  std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
+  int64_t col = in_trans.dims()[1];
+  const InT* in_trans_data = in_trans.data<InT>();
+  std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(),
+            [&](int64_t a, int64_t b) -> bool {
+              for (int64_t i = 0; i < col; ++i) {
+                InT lhs = in_trans_data[i + a * col];
+                InT rhs = in_trans_data[i + b * col];
+                if (lhs < rhs) {
+                  return true;
+                } else if (lhs > rhs) {
+                  return false;
+                }
+              }
+              return false;
+            });
+
+  // sort tensor according to indices
+  framework::Tensor input_sorted;
+  input_sorted.Resize(in_trans_dims);
+  input_sorted.mutable_data<InT>(context.GetPlace());
+  InT* input_sorted_data = input_sorted.data<InT>();
+  for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
+    memcpy(input_sorted_data + i * col,
+           in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
+           col * sizeof(InT));
+  }
+
+  std::vector<framework::Tensor> input_unbind = Unbind(input_sorted);
+  std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
+  auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, InT>(
+      context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
+      &inverse_vec, &counts_vec, &indices_vec);
+  input_unbind.erase(last, input_unbind.end());
+  counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
+  indices_vec.erase(indices_vec.begin() + input_unbind.size(),
+                    indices_vec.end());
+
+  math::ConcatFunctor<DeviceContext, InT> concat_functor;
+  framework::Tensor out_trans;
+  std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
+  out_trans_dims_vec[0] = input_unbind.size();
+  out_trans.Resize(framework::make_ddim(out_trans_dims_vec));
+  out_trans.mutable_data<InT>(context.GetPlace());
+  std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
+  out->Resize(framework::make_ddim(out_trans_dims_vec));
+  out->mutable_data<InT>(context.GetPlace());
+  concat_functor(dev_ctx, input_unbind, 0, &out_trans);
+  TransCompute<DeviceContext, InT>(out_trans.dims().size(), dev_ctx, out_trans,
+                                   out, permute);
+
+  if (return_inverse) {
+    auto* inverse = context.Output<framework::Tensor>("Index");
+    framework::TensorFromVector(inverse_vec, context.device_context(), inverse);
+  }
+
+  if (return_counts) {
+    auto* count = context.Output<framework::Tensor>("Counts");
+    framework::TensorFromVector(counts_vec, context.device_context(), count);
+  }
+
+  if (return_index) {
+    auto* indices = context.Output<framework::Tensor>("Indices");
+    framework::TensorFromVector(indices_vec, context.device_context(), indices);
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct UniqueFlattendTensorFunctor {
+  const framework::ExecutionContext& ctx_;
+  const framework::Tensor& in_;
+  framework::Tensor* out_;
+  const bool return_index_;
+  const bool return_inverse_;
+  const bool return_counts_;
+
+  UniqueFlattendTensorFunctor(const framework::ExecutionContext& context,
+                              const framework::Tensor& in,
+                              framework::Tensor* out, bool return_index,
+                              bool return_inverse, bool return_counts)
+      : ctx_(context),
+        in_(in),
+        out_(out),
+        return_index_(return_index),
+        return_inverse_(return_inverse),
+        return_counts_(return_counts) {}
+
+  template <typename IndexT>
+  void apply() const {
+    UniqueFlattendTensor<InT, IndexT>(ctx_, in_, out_, return_index_,
+                                      return_inverse_, return_counts_);
+  }
+};
+
+template <typename DeviceContext, typename InT>
+struct UniqueDimFunctor {
+  const framework::ExecutionContext& ctx_;
+  const framework::Tensor& in_;
+  framework::Tensor* out_;
+  const int axis_;
+  const bool return_index_;
+  const bool return_inverse_;
+  const bool return_counts_;
+
+  UniqueDimFunctor(const framework::ExecutionContext& context,
+                   const framework::Tensor& in, framework::Tensor* out,
+                   const int axis, bool return_index, bool return_inverse,
+                   bool return_counts)
+      : ctx_(context),
+        in_(in),
+        out_(out),
+        axis_(axis),
+        return_index_(return_index),
+        return_inverse_(return_inverse),
+        return_counts_(return_counts) {}
+
+  template <typename IndexT>
+  void apply() const {
+    UniqueDim<DeviceContext, InT, IndexT>(
+        ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_);
+  }
+};
+
+template <typename DeviceContext, typename T>
 class UniqueKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* index = context.Output<framework::Tensor>("Index");
+    auto data_type = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+    if (data_type == framework::proto::VarType::INT32) {
+      PADDLE_ENFORCE_LE(
+          x->numel(), INT_MAX,
+          platform::errors::InvalidArgument(
+              "The number of elements in Input(X) should be less than or "
+              "equal to INT_MAX, but received num is %d. Please set `dtype` to "
+              "int64.",
+              x->numel()));
+    }
+    if (!context.Attr<bool>("is_sorted")) {
+      auto* index = context.Output<framework::Tensor>("Index");
 
-    framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
+      framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
+      return;
+    }
+
+    std::vector<int> axis_vec = context.Attr<std::vector<int>>("axis");
+    bool return_index = context.Attr<bool>("return_index");
+    bool return_inverse = context.Attr<bool>("return_inverse");
+    bool return_counts = context.Attr<bool>("return_counts");
+
+    if (axis_vec.empty()) {
+      framework::VisitDataTypeSmall(
+          data_type,
+          UniqueFlattendTensorFunctor<DeviceContext, T>(
+              context, *x, out, return_index, return_inverse, return_counts));
+    } else {
+      int axis = axis_vec[0];
+      framework::VisitDataTypeSmall(
+          data_type, UniqueDimFunctor<DeviceContext, T>(
+                         context, *x, out, axis, return_index, return_inverse,
+                         return_counts));
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/xpu/mul_xpu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79aae71c3045f938f4b8f0d3e05ce7cf358c41ea
--- /dev/null
+++ b/paddle/fluid/operators/xpu/mul_xpu_op.cc
@@ -0,0 +1,183 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/operators/mul_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class MulXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* y = context.Input<Tensor>("Y");
+    Tensor* z = context.Output<Tensor>("Out");
+    const Tensor x_matrix =
+        x->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *x, context.template Attr<int>("x_num_col_dims"))
+            : *x;
+    const Tensor y_matrix =
+        y->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *y, context.template Attr<int>("y_num_col_dims"))
+            : *y;
+    z->mutable_data<T>(context.GetPlace());
+    auto z_dim = z->dims();
+    if (z_dim.size() != 2) {
+      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
+    bool trans_a = false;
+    bool trans_b = false;
+    int m = x_matrix.dims()[0];
+    int k = x_matrix.dims()[1];
+    int k1 = y_matrix.dims()[0];
+    int n = y_matrix.dims()[1];
+    PADDLE_ENFORCE_EQ(
+        k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+    T alpha = static_cast<T>(1.0);
+    T beta = static_cast<T>(0.0);
+    const T* data_a = x_matrix.data<T>();
+    const T* data_b = y_matrix.data<T>();
+    T* data_c = z->data<T>();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k,
+                            alpha, data_a, data_b, beta, data_c);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    if (z_dim.size() != 2) {
+      z->Resize(z_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MulGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
+    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto x_matrix = x->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                        : static_cast<const Tensor&>(*x);
+    auto y_matrix = y->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                        : static_cast<const Tensor&>(*y);
+    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    Tensor dout_mat;
+    dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
+                     framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
+    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    if (dx != nullptr) {
+      dx->set_lod(x->lod());
+    }
+    if (dy != nullptr) {
+      dy->set_lod(y->lod());
+    }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      Tensor dx_matrix = dx->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                             : *dx;
+      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+      // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
+      bool trans_a = false;
+      bool trans_b = true;
+      int m = dout_mat.dims()[0];
+      int k = dout_mat.dims()[1];
+      int n = y_matrix.dims()[0];
+      int k1 = y_matrix.dims()[1];
+      PADDLE_ENFORCE_EQ(
+          k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = dout->data<T>();
+      const T* data_b = y_matrix.data<T>();
+      T* data_c = dx_matrix.data<T>();
+      int ret =
+          xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
+                          data_a, lda, data_b, ldb, beta, data_c, ldc);
+      PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU API return wrong value[%d], please check "
+                            "where Baidu Kunlun Card is properly installed.",
+                            ret));
+    }
+
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      Tensor dy_matrix = dy->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                             : *dy;
+      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
+      // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
+      bool trans_a = true;
+      bool trans_b = false;
+      int k = x_matrix.dims()[0];
+      int m = x_matrix.dims()[1];
+      int k1 = dout_mat.dims()[0];
+      int n = dout_mat.dims()[1];
+      PADDLE_ENFORCE_EQ(
+          k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = x_matrix.data<T>();
+      const T* data_b = dout->data<T>();
+      T* data_c = dy_matrix.data<T>();
+      int ret =
+          xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
+                          data_a, lda, data_b, ldb, beta, data_c, ldc);
+      PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU API return wrong value[%d], please check "
+                            "where Baidu Kunlun Card is properly installed.",
+                            ret));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    mul_grad, ops::MulGradXPUKernel<paddle::platform::XPUDeviceContext, float>)
+#endif
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 5a100c5746e616e860811dd47da27036ea7355d5..652b4dd47daa8aecdcae43e8c910d7dd61bbb64d 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -4,6 +4,12 @@ if(WITH_GPU)
   proto_library(cuda_error_proto SRCS cuda_error.proto)
 endif(WITH_GPU)
 
+if(WITH_XPU)
+  set(XPU_CTX_DEPS xpulib)
+ELSE()
+  set(XPU_CTX_DEPS)
+endif(WITH_XPU)
+
 if (WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
   add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -45,11 +51,15 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
 
 cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
+if(WITH_XPU)
+cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
+endif()
+
 add_subdirectory(dynload)
 add_subdirectory(stream)
 
@@ -78,13 +88,17 @@ ELSE()
   set(STREAM_CALLBACK_DEPS)
 ENDIF()
 
+if(WITH_GLOO)
+    cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce)
+endif()
+
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
-    ${dgc_deps} dlpack cudnn_workspace_helper)
+    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
 
 cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto  device_context enforce)
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 38b0894c3f71dc150a9ed737b0ac17b22baffb8a..29982c13c8ca88bc8b4a168f92e4116a283a97e8 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Place %s is not supported. Please check that your paddle compiles "
-        "with WITH_GPU option or check that your train process hold the "
+        "with WITH_GPU or WITH_XPU option or check that your train process "
+        "hold the "
         "correct gpu_id if you use Executor.",
         place));
   }
@@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(platform::errors::Unimplemented(
           "CUDAPlace is not supported. Please re-compile with WITH_GPU "
           "option."));
+#endif
+    } else if (platform::is_xpu_place(p)) {
+#ifdef PADDLE_WITH_XPU
+      EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("XPUPlace is not supported. Please "
+                                          "re-compile with WITH_XPU option."));
 #endif
     }
   }
@@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
+#ifdef PADDLE_WITH_XPU
+XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
+
+XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
+
+XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  context_ = xpu::create_context();
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+}
+
+void XPUDeviceContext::Wait() const {
+  int ret = xpu_set_device(place_.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  xpu_wait();
+}
+
+Place XPUDeviceContext::GetPlace() const { return place_; }
+
+xpu::Context* XPUDeviceContext::x_context() const { return context_; }
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
@@ -412,9 +464,21 @@ MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) {
   return cur_paddle_data_layout;
 }
 
-void MKLDNNDeviceContext::ResetBlobMap() const {
-  VLOG(3) << "Clearing DNNL cache.";
-  p_blobmap_->clear();
+void MKLDNNDeviceContext::ResetBlobMap() {
+  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
+  if (!block_next_cache_clearing_) {
+    VLOG(3) << "Clearing DNNL cache.";
+    p_blobmap_->clear();
+  } else {
+    VLOG(3) << "Prevented Clearing DNNL cache.";
+    block_next_cache_clearing_ = false;
+  }
+}
+
+void MKLDNNDeviceContext::BlockNextCacheClearing() {
+  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
+  VLOG(3) << "Next DNNL cache clearing has been blocked.";
+  block_next_cache_clearing_ = true;
 }
 
 size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 7511edb9ccf2c6ca1d5aea2964799b8be08064b6..8bfdfc8a1c6033a79c197e1cd425197f77079bda 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -43,6 +43,10 @@ limitations under the License. */
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
+
 namespace paddle {
 namespace platform {
 
@@ -76,6 +80,35 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
   using TYPE = CPUDeviceContext;
 };
 
+#ifdef PADDLE_WITH_XPU
+class XPUDeviceContext : public DeviceContext {
+ public:
+  XPUDeviceContext();
+  explicit XPUDeviceContext(XPUPlace place);
+  virtual ~XPUDeviceContext();
+  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  Place GetPlace() const override;
+  xpu::Context* x_context() const;
+
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const override;
+
+ private:
+  XPUPlace place_;
+  xpu::Context* context_;
+
+  // Need to be the same with other DeviceContext,
+  // Eventhough eigen_device_ is not used in XPU
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
+};
+
+template <>
+struct DefaultDeviceContextType<platform::XPUPlace> {
+  using TYPE = XPUDeviceContext;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice;
@@ -487,7 +520,10 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   const mkldnn::engine& GetEngine() const { return engine_; }
 
   // Remove all entries from the blob map
-  void ResetBlobMap() const;
+  void ResetBlobMap();
+
+  // Prevent next ResetBlobMap()
+  void BlockNextCacheClearing();
 
   // Get the ShapeBlob size in cur_mkldnn_session_id.
   size_t GetShapeBlobSize() const;
@@ -506,6 +542,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   mkldnn::engine engine_;
   std::shared_ptr<BlobMap> p_blobmap_;
   std::shared_ptr<std::mutex> p_mutex_;
+  bool block_next_cache_clearing_ = false;
 };
 #endif
 
diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3de2e3957a990a254ffb762f996876a122a865bc
--- /dev/null
+++ b/paddle/fluid/platform/device_context_xpu_test.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/device_context.h"
+
+#include <vector>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(Device, Init) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::XPUDeviceContext;
+  using paddle::platform::XPUPlace;
+
+  int count = paddle::platform::GetXPUDeviceCount();
+  for (int i = 0; i < count; i++) {
+    XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i));
+    xpu::Context* ctx = device_context->x_context();
+    ASSERT_NE(nullptr, ctx);
+    delete device_context;
+  }
+}
+
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::XPUDeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::XPUPlace;
+
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
+
+  std::vector<Place> xpu_places;
+  int count = paddle::platform::GetXPUDeviceCount();
+  for (int i = 0; i < count; ++i) {
+    auto dev_ctx = pool.Get(XPUPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
+  }
+}
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 0eb28f0c0c3561f98891ff2a0ab5a26a20b07fb4..ebeb14e940e5fd904e506bca565c4aeae84c93cf 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -100,6 +100,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnCreateDropoutDescriptor);                  \
   __macro(cudnnDropoutGetStatesSize);                     \
   __macro(cudnnSetDropoutDescriptor);                     \
+  __macro(cudnnRestoreDropoutDescriptor);                 \
   __macro(cudnnCreateRNNDescriptor);                      \
   __macro(cudnnGetRNNParamsSize);                         \
   __macro(cudnnGetRNNWorkspaceSize);                      \
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 5b612677da3554f17ab3ac29ddc241eee5f7c768..ce1ec507307a2721e641ac15425c6a2321e514c7 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -266,7 +266,7 @@ inline std::string GetErrorSumaryString(StrType&& what, const char* file,
   std::ostringstream sout;
   sout << "\n----------------------\nError Message "
           "Summary:\n----------------------\n";
-  sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
+  sout << string::Sprintf("%s (at %s:%d)", std::forward<StrType>(what), file,
                           line)
        << std::endl;
   return sout.str();
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 8667375c6f2726f1099c6e57c6e793252b01d454..af8798a4b7cf5a8832ce9345cad45ce3096484e4 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -508,3 +508,16 @@ DEFINE_int32(
     "summary will be shown."
     "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
     "error message summary will be shown.");
+
+/**
+ * Debug related FLAG
+ * Name: sort_sum_gradient
+ * Since Version: 2.0.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: If True, gradients are summed by the reverse order of
+ * the forward execution sequence.
+ */
+DEFINE_bool(sort_sum_gradient, false,
+            "Sum gradients by the reverse order of "
+            "the forward execution sequence.");
diff --git a/paddle/fluid/platform/gloo_context.cc b/paddle/fluid/platform/gloo_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32e7299d319c91891c7c05dd1e8cfa85e99a0422
--- /dev/null
+++ b/paddle/fluid/platform/gloo_context.cc
@@ -0,0 +1,33 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/gloo_context.h"
+
+namespace paddle {
+namespace platform {
+#if defined(PADDLE_WITH_GLOO)
+void GlooParallelContext::Init() {
+  auto gloo_ptr = paddle::framework::GlooWrapper::GetInstance();
+  gloo_ptr->SetRank(strategy_.rank);
+  gloo_ptr->SetSize(strategy_.rank_num);
+  gloo_ptr->SetPrefix(strategy_.prefix);
+  gloo_ptr->SetIface(strategy_.iface);
+  gloo_ptr->SetTimeoutSeconds(strategy_.init_seconds, strategy_.run_seconds);
+  gloo_ptr->SetHdfsStore(strategy_.path, strategy_.fs_name, strategy_.fs_ugi);
+  gloo_ptr->Init();
+}
+#endif
+
+}  //  namespace platform
+}  //  namespace paddle
diff --git a/paddle/fluid/platform/gloo_context.h b/paddle/fluid/platform/gloo_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7dcf288a22c71c29fc22ec5e131249662214b8d
--- /dev/null
+++ b/paddle/fluid/platform/gloo_context.h
@@ -0,0 +1,51 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+
+namespace paddle {
+namespace platform {
+
+#if defined(PADDLE_WITH_GLOO)
+struct GlooParallelStrategy {
+  int rank{0};
+  int rank_num{1};
+  std::string iface;
+  std::string prefix;
+  int init_seconds{9999999};
+  int run_seconds{9999999};
+  std::string path;
+  std::string fs_name;
+  std::string fs_ugi;
+};
+
+class GlooParallelContext {
+ public:
+  explicit GlooParallelContext(const GlooParallelStrategy& strategy)
+      : strategy_(strategy) {}
+
+  virtual ~GlooParallelContext() {}
+
+  virtual void Init();
+
+ protected:
+  GlooParallelStrategy strategy_;
+};
+#endif
+
+}  //  namespace platform
+}  //  namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 5f63233d8bee4beefd6e1695d8bc3d6e5e4ae7fb..ca1e5501c6a84e6136c28f564a78a7e63f0ee8d4 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
@@ -38,11 +39,11 @@ USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
 
-/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much
-faster way to query device properties. You can see details in
-https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
-*/
+int CudnnVersion() {
+  if (!dynload::HasCUDNN()) return -1;
 
+  return dynload::cudnnGetVersion();
+}
 static int GetCUDADeviceCountImpl() {
   int driverVersion = 0;
   cudaError_t status = cudaDriverGetVersion(&driverVersion);
@@ -73,6 +74,10 @@ int GetCUDADeviceCount() {
   return dev_cnt;
 }
 
+/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much
+faster way to query device properties. You can see details in
+https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
+*/
 int GetCUDAComputeCapability(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                     platform::errors::InvalidArgument(
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 6a9893647172e2c63f4749fdb0ae1cb0fdfaaf04..ec77447ef77dbb1cd7ee180176f95a9ab8f7c03a 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -23,6 +23,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
+//! Get the version of cudnn
+int CudnnVersion();
 
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 261f6e807a22d328a20156bed8ee9974637898c3..2e708e44fd0e49e1c33e048084d15e13c6e4d57e 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -33,6 +33,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
+
 DECLARE_int32(paddle_num_threads);
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
              "Multiple of the CUPTI device buffer size. If the timestamps have "
@@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) {
   } catch (const std::exception &exp) {
     LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  try {
+    // use user specified XPUs in single-node multi-process mode.
+    devices = platform::GetXPUSelectedDevices();
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
+  }
 #endif
   InitDevices(init_p2p, devices);
 }
@@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
       LOG(WARNING) << "Invalid devices id.";
       continue;
     }
+
+#ifdef PADDLE_WITH_CUDA
     places.emplace_back(platform::CUDAPlace(devices[i]));
+#endif
+#ifdef PADDLE_WITH_XPU
+    places.emplace_back(platform::XPUPlace(devices[i]));
+#endif
   }
   if (init_p2p) {
     InitP2P(devices);
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index 6392c4f4c42af9030e9dd0b3373df60938a4676f..f14fbdd74f95bfbed53ff787af861ce4656159c0 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -20,7 +20,7 @@ TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
   using paddle::platform::DeviceContextPool;
 
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
   InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
@@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) {
 #endif
 }
 
+TEST(InitDevices, XPU) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_XPU
+  int count = paddle::platform::GetXPUDeviceCount();
+  InitDevices(true);
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
+}
+
 #ifndef _WIN32
 TEST(SignalHandle, SignalHandle) {
   std::string msg = "Signal raises";
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index c74c47b7d84820f089d4e657f8bddccc5de8d727..3782eb684f21f8c09e9dac124082ae596fe5d1bc 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -129,6 +129,16 @@ inline void ClearMKLDNNCache(const platform::Place& place) {
   }
 }
 
+inline void DontClearMKLDNNCache(const platform::Place& place) {
+  // Clear mkl-dnn cache,
+  if (platform::is_cpu_place(place)) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::MKLDNNDeviceContext* dev_ctx =
+        (platform::MKLDNNDeviceContext*)pool.Get(place);
+    dev_ctx->BlockNextCacheClearing();
+  }
+}
+
 template <typename Type>
 mkldnn::memory::data_type MKLDNNGetDataType() {
   return mkldnn::memory::data_type::undef;
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 5d7143f56b3f394bb1a99c1b3802b7c20138dfb7..d1c5480c0f5438826e6eb5cc0de211ee1af74cf7 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -82,17 +82,21 @@ class MKLDNNHandlerT {
         fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
   }
 
+  template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
-    T* ptr = output->mutable_data<T>(place_, fwd_pd_->dst_desc().get_size());
+    T_out* ptr =
+        output->mutable_data<T_out>(place_, fwd_pd_->dst_desc().get_size());
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr,
                                             "@dst_mem_p");
   }
 
+  template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
       const framework::Tensor* output) {
-    const T* output_data = output->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        bwd_pd_->dst_desc(), to_void_cast<T>(output_data), "@bwd-dst_mem_p");
+    const T_out* output_data = output->data<T_out>();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(),
+                                            to_void_cast<T_out>(output_data),
+                                            "@bwd-dst_mem_p");
   }
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 195acc1b6d15a91369d48164179cd6e0b5cfac8d..b80d2fd1632cd82c231fae724fc4d754b8fed0fc 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> {
   void operator()(const CUDAPlace &p) {
     os_ << "CUDAPlace(" << p.device << ")";
   }
+  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
@@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }
 
+bool is_xpu_place(const Place &p) {
+  return boost::apply_visitor(IsXPUPlace(), p);
+}
+
 bool is_cpu_place(const Place &p) {
   return boost::apply_visitor(IsCPUPlace(), p);
 }
@@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
     if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
+    } else if (is_xpu_place(p1)) {
+      return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
     } else {
       return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
     }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index eeda10a633b655dee0da9197888738cd94b50809..f95f6954a32e771e7413a766afcfea8b85ff1f7e 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -58,31 +58,58 @@ struct CUDAPinnedPlace {
   inline bool operator<(const CUDAPinnedPlace &) const { return false; }
 };
 
+// Place for Baidu Kunlun Accelerator
+struct XPUPlace {
+  XPUPlace() : XPUPlace(0) {}
+  explicit XPUPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const XPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const XPUPlace &o) const { return !(*this == o); }
+  inline bool operator<(const XPUPlace &o) const { return device < o.device; }
+
+  int device;
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
 struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
 struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
 
-class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
+struct IsXPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &xpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+class Place
+    : public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
  private:
-  using PlaceBase = boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace>;
+  using PlaceBase =
+      boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
 
  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}     // NOLINT
+  Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}     // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)                // NOLINT
       : PlaceBase(cuda_pinned_place) {}
@@ -98,6 +125,7 @@ class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
 using PlaceList = std::vector<Place>;
 
 bool is_gpu_place(const Place &);
+bool is_xpu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
@@ -115,6 +143,16 @@ struct PlaceVisitorWrapper
     return visitor_(cpu);
   }
 
+  typename Visitor::result_type operator()(const XPUPlace &xpu) const {
+#ifdef PADDLE_WITH_XPU
+    return visitor_(xpu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with XPU. Cannot visit xpu device"));
+    return typename Visitor::result_type();
+#endif
+  }
+
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #ifdef PADDLE_WITH_CUDA
     return visitor_(cuda);
diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc
index e4c1d3def90f191194b46bb9ea27dd27d69dcb8b..13f28c73f4504aea85d6155a3daa8f8f01b26385 100644
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
@@ -18,19 +18,32 @@
 TEST(Place, Equality) {
   paddle::platform::CPUPlace cpu;
   paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
+  paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
   EXPECT_EQ(g1, g1);
   EXPECT_EQ(g0, gg0);
+  EXPECT_EQ(x0, x0);
+  EXPECT_EQ(x1, x1);
+  EXPECT_EQ(x0, xx0);
 
   EXPECT_NE(g0, g1);
+  EXPECT_NE(x0, x1);
 
   EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
+  EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
   EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0));
 }
 
 TEST(Place, Print) {
+  {
+    std::stringstream ss;
+    ss << paddle::platform::XPUPlace(1);
+    EXPECT_EQ("XPUPlace(1)", ss.str());
+  }
   {
     std::stringstream ss;
     ss << paddle::platform::CUDAPlace(1);
diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c5f85f9cfe4b9d6ac07069fff89d37c695af5b
--- /dev/null
+++ b/paddle/fluid/platform/xpu_header.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include "xpu/api.h"
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+
+namespace xpu = baidu::xpu::api;
+#endif
diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f88248fda7e65e1b96448c0576880a18a9d8a4a9
--- /dev/null
+++ b/paddle/fluid/platform/xpu_info.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/xpu_info.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+#include "gflags/gflags.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/string/split.h"
+
+DEFINE_string(selected_xpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (XPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication"
+              "between XPU devices, use XPU_VISIBLE_DEVICES can only use"
+              "share-memory only.");
+
+namespace paddle {
+namespace platform {
+
+static int GetXPUDeviceCountImpl() {
+  const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
+  if (xpu_visible_devices != nullptr) {
+    std::string xpu_visible_devices_str(xpu_visible_devices);
+    if (std::all_of(xpu_visible_devices_str.begin(),
+                    xpu_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
+      return 0;
+    }
+  }
+
+  int count = 0;
+  int ret = xpu_device_count(&count);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  return count;
+}
+
+int GetXPUDeviceCount() {
+  static auto dev_cnt = GetXPUDeviceCountImpl();
+  return dev_cnt;
+}
+
+int GetXPUCurrentDeviceId() {
+  int dev_id;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  return dev_id;
+}
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices() {
+  // use user specified XPUs in single-node multi-process mode.
+  std::vector<int> devices;
+  if (!FLAGS_selected_xpus.empty()) {
+    auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetXPUDeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
+void SetXPUDeviceId(int id) {
+  PADDLE_ENFORCE_LT(
+      id, GetXPUDeviceCount(),
+      platform::errors::InvalidArgument("id must less than XPU count"));
+  int ret = xpu_set_device(id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..efaba13453e7472ed09ff66c70bdaf19eb89549d
--- /dev/null
+++ b/paddle/fluid/platform/xpu_info.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <vector>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total number of XPU devices in system.
+int GetXPUDeviceCount();
+
+//! Get the current XPU device id in system.
+int GetXPUCurrentDeviceId();
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices();
+
+//! Set the XPU device id for next execution.
+void SetXPUDeviceId(int device_id);
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index b5165078cb17fe404d7a12230f02283b41391a3f..d733cf26ed209bcb86eaf2d366e45cfa0e7f9a90 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
-  gloo_wrapper infer_io_utils heter_wrapper)
+  gloo_wrapper infer_io_utils heter_wrapper generator)
 
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
@@ -37,7 +37,13 @@ set(PYBIND_SRCS
   data_set_py.cc
   imperative.cc
   ir.cc
-  inference_api.cc)
+  inference_api.cc
+  generator_py.cc)
+
+if(WITH_GLOO)
+  set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context)
+  set(PYBIND_SRCS ${PYBIND_SRCS} gloo_context_py.cc)
+endif(WITH_GLOO)
 
 if (WITH_CRYPTO)
   set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto)
@@ -71,13 +77,23 @@ if(WITH_PYTHON)
   set(tmp_impl_file ${impl_file}.tmp)
 
   if(WIN32)
-    add_custom_command(TARGET op_function_generator
-          POST_BUILD
-          COMMAND "${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator"
-              "${tmp_impl_file}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
-          COMMENT "copy_if_different ${impl_file}"
-          VERBATIM
+    file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ""
+    "set build_times=1\n"
+    ":retry\n"
+    "ECHO op_function_generator run %build_times% time\n"
+    "${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator ${impl_file}\n"
+    "if %ERRORLEVEL% NEQ 0 (\n"
+    "    set /a build_times=%build_times%+1\n"
+    "    if %build_times% GTR 100 (\n"
+    "        exit /b 1\n"
+    "    ) else (\n"
+    "        goto :retry\n"
+    "    )\n"
+    ")\n"
+    "exit /b 0")
+
+    add_custom_command(TARGET op_function_generator POST_BUILD
+          COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
     )
 
     if(${CBLAS_PROVIDER} STREQUAL MKLML)
diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bccd5fb2dd92298323381c09467937abd87a53c
--- /dev/null
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/pybind/generator_py.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+void BindGenerator(py::module* m) {
+  py::class_<framework::GeneratorState>(*m, "GeneratorState", "");
+  py::class_<std::mt19937_64>(*m, "mt19937_64", "");
+  py::class_<framework::Generator, std::shared_ptr<framework::Generator>>(
+      *m, "Generator")
+      .def(py::init([]() { return framework::Generator::GetInstanceX(); }),
+           py::return_value_policy::reference)
+      .def("get_state", &framework::Generator::GetState,
+           py::return_value_policy::move)
+      .def("set_state", &framework::Generator::SetState)
+      .def("manual_seed", &framework::Generator::SetCurrentSeed)
+      .def("seed", &framework::Generator::Seed)
+      .def("initial_seed", &framework::Generator::GetCurrentSeed)
+      .def("random", &framework::Generator::Random64)
+      .def("get_cpu_engine", &framework::Generator::GetCPUEngine,
+           py::return_value_policy::move)
+      .def("set_cpu_engine", &framework::Generator::SetCPUEngine);
+}  // end Generator
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/pybind/generator_py.h
similarity index 57%
rename from paddle/fluid/imperative/backward_strategy.h
rename to paddle/fluid/pybind/generator_py.h
index 0f04d6db8e63d5d069745ed1895df774e69d60d0..d37654c1ba24e296fb325d1507187c5a954754bd 100644
--- a/paddle/fluid/imperative/backward_strategy.h
+++ b/paddle/fluid/pybind/generator_py.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,22 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//
-// Created by Jiabin on 2019-04-25.
-//
 #pragma once
 
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
 namespace paddle {
-namespace imperative {
-namespace detail {
+namespace pybind {
 
-struct BackwardStrategy {
-  /* DyGraph now support two kinds of backward strategy, one is sorted sum
-   * gradient, another is sum gradient once they are created */
-  // TODO(jiabin): add more Strategy when we support
-  bool sorted_sum_gradient_{false};
-};
+void BindGenerator(py::module* m);
 
-}  // namespace detail
-}  // namespace imperative
+}  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index deca9625e63d05625c407a1282b396398bb78ccc..f1084018d9c79e46c33098dafdb48dc395dac652 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
 DECLARE_int32(call_stack_level);
+DECLARE_bool(sort_sum_gradient);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
       FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
       FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
       FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
       FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c77b0c3a06ca9a17f33643d88ddf932c32544
--- /dev/null
+++ b/paddle/fluid/pybind/gloo_context_py.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/gloo_context_py.h"
+
+#include <Python.h>
+#include <pybind11/chrono.h>
+#include <pybind11/complex.h>
+#include <pybind11/functional.h>
+#include <pybind11/stl.h>
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
+#include "paddle/fluid/platform/gloo_context.h"
+
+namespace paddle {
+namespace pybind {
+
+namespace py = ::pybind11;
+
+// Bind Methods
+void BindGlooContext(py::module *m) {
+// define parallel context for gloo
+#if defined(PADDLE_WITH_GLOO)
+  py::class_<platform::GlooParallelStrategy> gloo_parallel_strategy(
+      *m, "GlooParallelStrategy", "");
+  gloo_parallel_strategy.def(py::init())
+      .def_property("rank_num",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.rank_num;
+                    },
+                    [](platform::GlooParallelStrategy &self, int nranks) {
+                      self.rank_num = nranks;
+                    })
+      .def_property(
+          "rank",
+          [](const platform::GlooParallelStrategy &self) { return self.rank; },
+          [](platform::GlooParallelStrategy &self, int rank) {
+            self.rank = rank;
+          })
+      .def_property(
+          "iface",
+          [](const platform::GlooParallelStrategy &self) { return self.iface; },
+          [](platform::GlooParallelStrategy &self, const std::string &iface) {
+            self.iface = iface;
+          })
+      .def_property("prefix",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.prefix;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &prefix) { self.prefix = prefix; })
+      .def_property("init_seconds",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.init_seconds;
+                    },
+                    [](platform::GlooParallelStrategy &self, int init_seconds) {
+                      self.init_seconds = init_seconds;
+                    })
+      .def_property("run_seconds",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.run_seconds;
+                    },
+                    [](platform::GlooParallelStrategy &self, int run_seconds) {
+                      self.run_seconds = run_seconds;
+                    })
+      .def_property(
+          "path",
+          [](const platform::GlooParallelStrategy &self) { return self.path; },
+          [](platform::GlooParallelStrategy &self, const std::string &path) {
+            self.path = path;
+          })
+      .def_property("fs_name",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.fs_name;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &fs_name) { self.fs_name = fs_name; })
+      .def_property("fs_ugi",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.fs_ugi;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &fs_ugi) { self.fs_ugi = fs_ugi; });
+
+  py::class_<platform::GlooParallelContext> gloo_ctx(*m, "GlooParallelContext");
+  gloo_ctx.def(py::init<const platform::GlooParallelStrategy &>())
+      .def("init", [](platform::GlooParallelContext &self) { self.Init(); });
+#endif
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..89bd183097b7541c33a797f27178bafb934bcd52
--- /dev/null
+++ b/paddle/fluid/pybind/gloo_context_py.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindGlooContext(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 82941c58280560b1c09b149da01ef3d6e8a3f8e0..489dd198876204486fc94518fbef0c806d0543d4 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -19,14 +19,17 @@ limitations under the License. */
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
+
 #include <memory>
 #include <set>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/imperative/all_reduce.h"
-#include "paddle/fluid/imperative/backward_strategy.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/basic_engine.h"
 #include "paddle/fluid/imperative/data_loader.h"
 #include "paddle/fluid/imperative/layer.h"
@@ -62,11 +65,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::CPUPlace>();
   } else if (py::isinstance<platform::CUDAPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPlace>();
+  } else if (py::isinstance<platform::XPUPlace>(place_obj)) {
+    return place_obj.cast<platform::XPUPlace>();
   } else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPinnedPlace>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
   }
 }
 
@@ -74,16 +79,23 @@ static void InitTensorForVarBase(imperative::VarBase *self,
                                  const py::array &array,
                                  const platform::Place place,
                                  bool persistable = false,
-                                 bool zero_copy = false,
-                                 std::string name = "") {
+                                 bool zero_copy = false, std::string name = "",
+                                 int stop_gradient = -1) {
   if (name == "") {
-    name = imperative::GetCurrentTracer()->GenerateUniqueName("generated_var");
+    name =
+        imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor");
   }
+  VLOG(5) << "Init Tensor as: / name: " << name
+          << " / persistable: " << persistable << " / zero_copy: " << zero_copy
+          << " / stop_gradient: " << stop_gradient;
   new (self) imperative::VarBase(name);
   auto *tensor = self->MutableVar()->GetMutable<framework::LoDTensor>();
   if (platform::is_cpu_place(place)) {
     SetTensorFromPyArray<platform::CPUPlace>(
         tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy);
+  } else if (platform::is_xpu_place(place)) {
+    SetTensorFromPyArray<platform::XPUPlace>(
+        tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy);
   } else if (platform::is_gpu_place(place)) {
     SetTensorFromPyArray<platform::CUDAPlace>(
         tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy);
@@ -93,7 +105,10 @@ static void InitTensorForVarBase(imperative::VarBase *self,
         zero_copy);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+  }
+  if (stop_gradient != -1) {
+    self->SetOverridedStopGradient(stop_gradient);
   }
   self->SetPersistable(persistable);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -102,12 +117,11 @@ static void InitTensorForVarBase(imperative::VarBase *self,
 
 static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self,
                                            const py::kwargs &kwargs) {
-  VLOG(4) << "Init VarBase";
+  VLOG(4) << "Init VarBase from kwargs: ";
   PADDLE_ENFORCE_EQ(
       kwargs.contains("value"), true,
       platform::errors::NotFound(
           "The kwargs used to create Varbase misses argument: value"));
-
   auto persistable = kwargs.contains("persistable")
                          ? kwargs["persistable"].cast<bool>()
                          : false;
@@ -116,10 +130,14 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self,
   auto zero_copy =
       kwargs.contains("zero_copy") ? kwargs["zero_copy"].cast<bool>() : false;
   auto name = kwargs.contains("name") ? kwargs["name"].cast<std::string>() : "";
+  auto stop_gradient = kwargs.contains("stop_gradient")
+                           ? kwargs["stop_gradient"].cast<int>()
+                           : -1;
   auto default_place = imperative::GetCurrentTracer()->ExpectedPlace();
   auto place = kwargs.contains("place") ? PyObjectToPlace(kwargs["place"])
                                         : default_place;
-  InitTensorForVarBase(self, array, place, persistable, zero_copy, name);
+  InitTensorForVarBase(self, array, place, persistable, zero_copy, name,
+                       stop_gradient);
 }
 
 template <typename P>
@@ -127,15 +145,24 @@ static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self,
                                         const py::array &array, const P &place,
                                         bool persistable = false,
                                         bool zero_copy = false,
-                                        std::string name = "") {
-  VLOG(4) << "Init VarBase";
-  // 0: self, 1: value, 2: place, 3: persistable, 4: zero_copy, 5: name
+                                        std::string name = "",
+                                        int stop_gradient = -1) {
+  VLOG(4) << "Init VarBase from Arg: ";
+  // 0: self, 1: value, 2: place, 3: persistable, 4: zero_copy, 5: name , 6:
+  // stop_gradient
   if (name == "") {
-    name = imperative::GetCurrentTracer()->GenerateUniqueName("generated_var");
+    name =
+        imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor");
   }
+  VLOG(5) << "Init Tensor as: / name: " << name
+          << " / persistable: " << persistable << " / zero_copy: " << zero_copy
+          << " / stop_gradient: " << stop_gradient;
   new (self) imperative::VarBase(name);
   self->SetPersistable(persistable);
   auto *tensor = self->MutableVar()->GetMutable<framework::LoDTensor>();
+  if (stop_gradient != -1) {
+    self->SetOverridedStopGradient(stop_gradient);
+  }
   SetTensorFromPyArray<P>(tensor, array, place, zero_copy);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
   self->SetDataType(tensor->type());
@@ -143,7 +170,7 @@ static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self,
 
 static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self,
                                                const py::array &array) {
-  VLOG(4) << "Init VarBase";
+  VLOG(4) << "Init VarBase from numpy: ";
   auto place = imperative::GetCurrentTracer()->ExpectedPlace();
   InitTensorForVarBase(self, array, place);
 }
@@ -153,7 +180,7 @@ static void InitVarBaseFromTensorWithArgDefault(
   VLOG(4) << "Init VarBase";
   auto place = imperative::GetCurrentTracer()->ExpectedPlace();
   new (self) imperative::VarBase(
-      imperative::GetCurrentTracer()->GenerateUniqueName("generated_var"));
+      imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor"));
   self->SetPersistable(false);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
   self->SetDataType(tensor.type());
@@ -479,50 +506,6 @@ void BindImperative(py::module *m_ptr) {
         []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
 #endif
 
-  py::class_<imperative::detail::BackwardStrategy> backward_strategy(
-      m, "BackwardStrategy", R"DOC(
-
-    BackwardStrategy is a descriptor of how to run the backward process.
-
-    **Note**:
-        **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode**
-
-    Attribute:
-        **sort_sum_gradient**:
-
-        If framework will sum the gradient by the reverse order of trace. eg. x_var ( :ref:`api_guide_Variable` ) will be the input of multiple OP such as :ref:`api_fluid_layers_scale` , this attr will decide if framework will sum gradient of `x_var` by the reverse order.
-
-        By Default: False
-
-        Examples:
-            .. code-block:: python
-
-                import numpy as np
-                import paddle.fluid as fluid
-
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    x_var = fluid.dygraph.to_variable(x)
-                    sums_inputs = []
-                    # x_var will be multi-scales' input here
-                    for _ in range(10):
-                        sums_inputs.append(fluid.layers.scale(x_var))
-                    ret2 = fluid.layers.sums(sums_inputs)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
-      )DOC");
-  backward_strategy.def(py::init())
-      .def_property("sort_sum_gradient",
-                    [](const imperative::detail::BackwardStrategy &self) {
-                      return self.sorted_sum_gradient_;
-                    },
-                    [](imperative::detail::BackwardStrategy &self,
-                       bool sorted_sum_gradient) {
-                      self.sorted_sum_gradient_ = sorted_sum_gradient;
-                    });
-
   m.def("start_imperative_gperf_profiler",
         []() { imperative::StartProfile(); });
 
@@ -537,8 +520,7 @@ void BindImperative(py::module *m_ptr) {
         });
 
   py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
-      m, "VarBase",
-      R"DOC()DOC")
+      m, "VarBase", R"DOC()DOC")
       .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
       .def("__init__",
            [](imperative::VarBase &self, framework::proto::VarType::Type dtype,
@@ -548,7 +530,7 @@ void BindImperative(py::module *m_ptr) {
              std::string act_name = "";
              if (!name.ptr() || name.ptr() == Py_None) {
                act_name = imperative::GetCurrentTracer()->GenerateUniqueName(
-                   "generated_var");
+                   "generated_tensor");
              } else {
                act_name = name.cast<std::string>();
              }
@@ -564,13 +546,20 @@ void BindImperative(py::module *m_ptr) {
            })
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CPUPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
+      .def("__init__", &InitVarBaseFromNumpyWithArg<platform::XPUPlace>,
+           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPinnedPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"))
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
@@ -711,21 +700,18 @@ void BindImperative(py::module *m_ptr) {
                          inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
                     loss2.clear_gradient()
                     print("After clear {}".format(loss2.gradient()))
       )DOC")
       .def("_run_backward",
-           [](imperative::VarBase &self,
-              const imperative::detail::BackwardStrategy &bckst,
-              const imperative::Tracer &tracer, bool retain_graph) {
+           [](imperative::VarBase &self, const imperative::Tracer &tracer,
+              bool retain_graph) {
              // TODO(jiabin): when we impl more backward execution we can
              // select them
              auto *engine = tracer.GetEngine();
-             engine->Init(&self, bckst, retain_graph);
+             engine->Init(&self, retain_graph);
              VLOG(3) << "Start backward";
              engine->Execute();
              VLOG(3) << "Finish backward";
@@ -793,6 +779,15 @@ void BindImperative(py::module *m_ptr) {
            [](const imperative::VarBase &self, const platform::CPUPlace &place,
               bool blocking) { return self.NewVarBase(place, blocking); },
            py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const imperative::VarBase &self,
+              const platform::CUDAPinnedPlace &place,
+              bool blocking) { return self.NewVarBase(place, blocking); },
+           py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const imperative::VarBase &self, const platform::XPUPlace &place,
+              bool blocking) { return self.NewVarBase(place, blocking); },
+           py::return_value_policy::copy)
       .def("_copy_to",
            [](const imperative::VarBase &self, const platform::CUDAPlace &place,
               bool blocking) { return self.NewVarBase(place, blocking); },
@@ -821,6 +816,9 @@ void BindImperative(py::module *m_ptr) {
               return std::vector<int>();
             }
           })
+      .def_property_readonly(
+          "place", [](imperative::VarBase &self) { return self.Place(); },
+          py::return_value_policy::copy)
       .def_property_readonly("type", &imperative::VarBase::Type)
       .def_property_readonly("dtype", &imperative::VarBase::DataType);
 
@@ -838,13 +836,14 @@ void BindImperative(py::module *m_ptr) {
       .def("reset", &imperative::jit::ProgramDescTracer::Reset);
 
   py::class_<imperative::Tracer, std::shared_ptr<imperative::Tracer>>(
-      m, "Tracer",
-      R"DOC()DOC")
+      m, "Tracer", R"DOC()DOC")
       .def("__init__",
            [](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
       .def_property("_enable_program_desc_tracing",
                     &imperative::Tracer::IsProgramDescTracingEnabled,
                     &imperative::Tracer::SetEnableProgramDescTracing)
+      .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
+                    &imperative::Tracer::SetEnableAutoCast)
       .def_property("_train_mode", &imperative::Tracer::HasGrad,
                     &imperative::Tracer::SetHasGrad)
       .def_property(
@@ -856,6 +855,9 @@ void BindImperative(py::module *m_ptr) {
             if (py::isinstance<platform::CUDAPlace>(obj)) {
               auto p = obj.cast<platform::CUDAPlace *>();
               self.SetExpectedPlace(*p);
+            } else if (py::isinstance<platform::XPUPlace>(obj)) {
+              auto p = obj.cast<platform::XPUPlace *>();
+              self.SetExpectedPlace(*p);
             } else if (py::isinstance<platform::CPUPlace>(obj)) {
               auto p = obj.cast<platform::CPUPlace *>();
               self.SetExpectedPlace(*p);
@@ -864,7 +866,8 @@ void BindImperative(py::module *m_ptr) {
               self.SetExpectedPlace(*p);
             } else {
               PADDLE_THROW(platform::errors::InvalidArgument(
-                  "Incompatible Place Type: supports CUDAPlace, CPUPlace, "
+                  "Incompatible Place Type: supports XPUPlace, CUDAPlace, "
+                  "CPUPlace, "
                   "and CUDAPinnedPlace, "
                   "but got Unknown Type!"));
             }
@@ -874,6 +877,39 @@ void BindImperative(py::module *m_ptr) {
            py::return_value_policy::reference)
       .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName,
            py::arg("key") = "eager_tmp")
+      .def(
+          "_set_amp_op_list",
+          [](imperative::Tracer &self,
+             std::unordered_set<std::string> &allow_ops,
+             std::unordered_set<std::string> &block_ops) {
+            // NOTE(zhiqiu): The automatic conversion in pybind11 between c++
+            // STL and python set/list/dict involve a copy operation that
+            // prevents pass-by-reference semantics, so it is ok to swap.
+            // The reaseon why not directly pass
+            // std::shared_ptr<std::unordered_set<std::string>>
+            // is that pybind11 forbid shared_ptr<T> where T is not custom type.
+            imperative::AmpOperators::Instance().GetAllowOps()->swap(allow_ops);
+            imperative::AmpOperators::Instance().GetBlockOps()->swap(block_ops);
+          })
+      .def("_get_amp_op_list",
+           [](imperative::Tracer &self) {
+             return std::make_tuple(
+                 *(imperative::AmpOperators::Instance().GetAllowOps()),
+                 *(imperative::AmpOperators::Instance().GetBlockOps()));
+           })
+      .def("trace",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs, const platform::XPUPlace &place,
+              bool trace_backward) {
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               py::gil_scoped_release release;
+               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
+                            std::move(attrs), place, trace_backward);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
@@ -940,13 +976,11 @@ void BindImperative(py::module *m_ptr) {
              &output_targets,
          const std::vector<std::shared_ptr<imperative::VarBase>> &output_grads,
          const std::vector<std::shared_ptr<imperative::VarBase>> &no_grad_vars,
-         const platform::Place &place,
-         const imperative::detail::BackwardStrategy &strategy,
-         bool create_graph, bool retain_graph, bool allow_unused,
-         bool only_inputs) {
+         const platform::Place &place, bool create_graph, bool retain_graph,
+         bool allow_unused, bool only_inputs) {
         imperative::PartialGradEngine engine(
             input_targets, output_targets, output_grads, no_grad_vars, place,
-            strategy, create_graph, retain_graph, allow_unused, only_inputs);
+            create_graph, retain_graph, allow_unused, only_inputs);
         engine.Execute();
         return engine.GetResult();
       },
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 696da67c9c98fe16b28ceb05d5c07049104fd43b..cf0dac022f74e47261fc28d02665bcde49dc8b39 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -448,6 +448,7 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::cpu_math_library_num_threads)
       .def("to_native_config", &AnalysisConfig::ToNativeConfig)
       .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
 #ifdef PADDLE_WITH_MKLDNN
       .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
            py::return_value_policy::reference)
@@ -565,6 +566,7 @@ void BindPaddlePassBuilder(py::module *m) {
       .def("enable_cudnn", &PassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &PassStrategy::EnableMKLDNN)
       .def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16)
       .def("use_gpu", &PassStrategy::use_gpu);
 
   py::class_<CpuPassStrategy, PassStrategy>(*m, "CpuPassStrategy")
@@ -572,14 +574,16 @@ void BindPaddlePassBuilder(py::module *m) {
       .def(py::init<const CpuPassStrategy &>())
       .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN)
-      .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer);
+      .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16);
 
   py::class_<GpuPassStrategy, PassStrategy>(*m, "GpuPassStrategy")
       .def(py::init<>())
       .def(py::init<const GpuPassStrategy &>())
       .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN)
-      .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer);
+      .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16);
 }
 }  // namespace
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h
index 597ead9327e233df785b58437afce8fa75a058c3..70b321f658cd2cf1bd43cd6440bf83e1f4dab140 100644
--- a/paddle/fluid/pybind/op_function.h
+++ b/paddle/fluid/pybind/op_function.h
@@ -18,9 +18,11 @@
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/variable.h"
@@ -31,15 +33,93 @@
 namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-static inline void ConstructAttrMapFromPyArgs(framework::AttributeMap* attrs,
+
+static inline std::shared_ptr<imperative::VarBase> CastPyHandleToVarBase(
+    const std::string& op_type, const std::string& arg_name, int arg_idx,
+    const py::handle& handle) {
+  PyObject* py_obj = handle.ptr();  // get underlying PyObject
+  if (!py_obj || py_obj == Py_None) {
+    return nullptr;
+  }
+  try {
+    return py::cast<std::shared_ptr<imperative::VarBase>>(py::handle(py_obj));
+  } catch (py::cast_error&) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument '%s' (position %d) must be Tensor, but got "
+        "%s",
+        op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name));
+  }
+}
+
+static inline std::vector<std::shared_ptr<imperative::VarBase>>
+CastPyHandleToVarBaseList(const std::string& op_type,
+                          const std::string& arg_name, int arg_idx,
+                          const py::handle& handle) {
+  PyObject* py_obj = handle.ptr();  // get underlying PyObject
+  if (!py_obj || py_obj == Py_None) {
+    return {};
+  }
+  std::vector<std::shared_ptr<imperative::VarBase>> result;
+  if (PyList_Check(py_obj) || PyTuple_Check(py_obj)) {
+    auto size = PyTuple_Check(py_obj) ? PyTuple_GET_SIZE(py_obj)
+                                      : PyList_GET_SIZE(py_obj);
+    for (auto i = 0; i < size; ++i) {
+      PyObject* item = PyTuple_Check(py_obj) ? PyTuple_GET_ITEM(py_obj, i)
+                                             : PyList_GET_ITEM(py_obj, i);
+      if (!item || item == Py_None) {
+        result.emplace_back(nullptr);
+        continue;
+      }
+      try {
+        result.emplace_back(
+            py::cast<std::shared_ptr<imperative::VarBase>>(py::handle(item)));
+      } catch (py::cast_error&) {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "%s(): argument '%s' (position %d) must be list of "
+            "Tensors, but "
+            "got %s in list (item %d)",
+            op_type, arg_name, arg_idx, Py_TYPE(item)->tp_name, i));
+      }
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument '%s' (position %d) must be list of Tensors, but got "
+        "%s",
+        op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name));
+  }
+  return result;
+}  // namespace pybind
+
+static inline void ConstructAttrMapFromPyArgs(const std::string& op_type,
+                                              int start_idx,
+                                              framework::AttributeMap* attrs,
                                               const py::args& args) {
   PADDLE_ENFORCE_EQ(
       args.size() % 2, 0,
       platform::errors::InvalidArgument(
           "The number of arguments for arributes should be even."));
   for (size_t i = 0; i < args.size(); i += 2) {
-    auto name = args[i].cast<std::string>();
-    auto value = args[i + 1].cast<framework::Attribute>();
+    std::string name;
+    framework::Attribute value;
+    try {
+      name = args[i].cast<std::string>();
+    } catch (std::exception& e) {
+      PyObject* py_obj = args[i].ptr();  // get underlying PyObject
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument (position %d) must be str, but got "
+          "%s",
+          op_type, start_idx + i, Py_TYPE(py_obj)->tp_name));
+    }
+    try {
+      value = args[i + 1].cast<framework::Attribute>();
+    } catch (std::exception& e) {
+      PyObject* py_obj = args[i + 1].ptr();  // get underlying PyObject
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument (position %d) must be "
+          "Attribute type (one of str, bool, int, int64, float, or list of "
+          "them), but got %s",
+          op_type, start_idx + i + 1, Py_TYPE(py_obj)->tp_name));
+    }
     (*attrs)[name] = value;
   }
 }
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 7412eede118d122b14c69ab663836c156eb740e2..256faf04ea6de5835f22113537caac49ca1dbab4 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -40,6 +40,9 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"assign", {"X"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"X", "InScale", "InAccum", "InState"}},
+    {"nll_loss", {"X", "Label", "Weight"}},
+    {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}},
+    {"gather", {"X", "Index", "Axis"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -56,6 +59,10 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
+    {"sync_batch_norm",
+     {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
+      "ReserveSpace"}},
+    {"unique", {"Out", "Index", "Indices", "Counts"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
@@ -75,9 +82,23 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
      {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
     {"momentum", {"ParamOut", "VelocityOut"}},
     {"batch_norm", {"MeanOut", "VarianceOut"}},
+    {"sync_batch_norm", {"MeanOut", "VarianceOut"}},
     {"accuracy", {"Correct", "Total"}},
     {"fill_constant", {"Out"}},
     {"matmul", {"Out"}},
+    {"c_broadcast", {"Out"}},
+    {"c_allreduce_sum", {"Out"}},
+    {"c_allreduce_max", {"Out"}},
+    {"c_allreduce_min", {"Out"}},
+    {"c_allreduce_prod", {"Out"}},
+    {"c_reduce_sum", {"Out"}},
+    {"c_reduce_max", {"Out"}},
+    {"c_reduce_min", {"Out"}},
+    {"c_reduce_prod", {"Out"}},
+    {"c_reduce", {"Out"}},
+    {"c_allgather", {"Out"}},
+    {"c_scatter", {"Out"}},
+    {"barrier", {"Out"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
@@ -115,8 +136,19 @@ const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"(
 const char* ARG_OUT_NUM = R"(%sNum)";
 const char* ARG_OUT_NUM_TYPE = R"(size_t )";
 
-const char* VAR_TYPE = R"(std::shared_ptr<imperative::VarBase>)";
-const char* VAR_LIST_TYPE = R"(std::vector<std::shared_ptr<imperative::VarBase>>)";
+const char* IN_VAR_TYPE = R"(py::handle)";
+const char* IN_VAR_LIST_TYPE = R"(py::handle)";
+
+const char* OUT_VAR_TYPE = R"(std::shared_ptr<imperative::VarBase>)";
+const char* OUT_VAR_LIST_TYPE = R"(std::vector<std::shared_ptr<imperative::VarBase>>)";
+
+const char* CAST_VAR_TEMPLATE = R"(
+  auto %s = CastPyHandleToVarBase("%s", "%s", %d, %s);)";
+
+const char* CAST_VAR_LIST_TEMPLATE = R"(
+  auto %s = CastPyHandleToVarBaseList("%s", "%s", %d, %s);)";
+
+
 const char* ARG_TEMPLATE = R"(const %s& %s)";
 
 const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)";
@@ -132,8 +164,9 @@ const char* OP_FUNCTION_TEMPLATE =
 R"(
 %s %s(%s)
 {
+  %s
   framework::AttributeMap attrs;
-  ConstructAttrMapFromPyArgs(&attrs, args);
+  ConstructAttrMapFromPyArgs("%s", %d, &attrs, args);
   {
     py::gil_scoped_release release;
     auto tracer = imperative::GetCurrentTracer();
@@ -163,6 +196,10 @@ static inline bool FindPassingOutsMap(const std::string& op_type,
   return op_passing_outs_map[op_type].count(out_name);
 }
 
+static inline std::string TempName(const std::string& name) {
+  return name + '_';
+}
+
 static std::tuple<std::vector<std::string>, std::vector<std::string>>
 GenerateOpFunctions(const std::string& module_name) {
   auto& op_info_map = paddle::framework::OpInfoMap::Instance().map();
@@ -186,16 +223,26 @@ GenerateOpFunctions(const std::string& module_name) {
     std::string ins_initializer = "{";
     std::string ins_initializer_with_null = "";
     std::string py_arg = "";
+    int arg_idx = 0;
+    int input_args_num = 0;
+    std::string ins_cast_str = "";
     for (auto& input : op_proto->inputs()) {
       auto& in_name = input.name();
       // skip those dispensable inputs, like ResidualData in conv2d
       if (input.dispensable() && !FindInsMap(op_type, in_name)) {
         continue;
       }
-      const auto in_type = input.duplicable() ? VAR_LIST_TYPE : VAR_TYPE;
-      auto input_arg = paddle::string::Sprintf(ARG_TEMPLATE, in_type, in_name);
+      const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE;
+      auto input_arg =
+          paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name));
       input_args += input_arg;
       input_args += ",";
+      input_args_num++;
+      const auto in_cast_type =
+          input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE;
+      ins_cast_str +=
+          paddle::string::Sprintf(in_cast_type, in_name, op_type, in_name,
+                                  arg_idx++, TempName(in_name));
 
       if (input.dispensable()) {
         const auto in_template = input.duplicable()
@@ -234,7 +281,8 @@ GenerateOpFunctions(const std::string& module_name) {
       if (output.dispensable() && !FindOutsMap(op_type, out_name)) {
         continue;
       }
-      const auto out_type = output.duplicable() ? VAR_LIST_TYPE : VAR_TYPE;
+      const auto out_type =
+          output.duplicable() ? OUT_VAR_LIST_TYPE : OUT_VAR_TYPE;
       const auto return_template =
           output.duplicable() ? RETURN_LIST_TEMPLATE : RETURN_TEMPLATE;
       if (FindPassingOutsMap(op_type, out_name)) {
@@ -243,6 +291,7 @@ GenerateOpFunctions(const std::string& module_name) {
         }
         input_args += out_type;
         input_args += out_name;
+        input_args_num++;
 
         if (output.dispensable()) {
           const auto out_template =
@@ -269,6 +318,7 @@ GenerateOpFunctions(const std::string& module_name) {
           auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name);
           input_args += ARG_OUT_NUM_TYPE;
           input_args += out_num_str;
+          input_args_num++;
           outs_initializer += paddle::string::Sprintf(
               OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str);
         } else {
@@ -308,9 +358,9 @@ GenerateOpFunctions(const std::string& module_name) {
     // generate op funtcion body
     auto op_function_str = paddle::string::Sprintf(
         OP_FUNCTION_TEMPLATE, return_type, func_name, function_args,
-        outs_initializer, ins_initializer,
-        ins_initializer_with_null + outs_initializer_with_null, op_type,
-        return_str);
+        ins_cast_str, op_type, input_args_num, outs_initializer,
+        ins_initializer, ins_initializer_with_null + outs_initializer_with_null,
+        op_type, return_str);
 
     // generate pybind item
     auto bind_function_str = paddle::string::Sprintf(
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d6a8b226637c15b76d869a23daf7168a7ac51211..4b8f7c853ceaf2148722a9c65f38e0ec3d9f4df5 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -64,7 +64,9 @@ limitations under the License. */
 #include "paddle/fluid/pybind/data_set_py.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/fleet_wrapper_py.h"
+#include "paddle/fluid/pybind/generator_py.h"
 #include "paddle/fluid/pybind/global_value_getter_setter.h"
+#include "paddle/fluid/pybind/gloo_context_py.h"
 #include "paddle/fluid/pybind/gloo_wrapper_py.h"
 #include "paddle/fluid/pybind/heter_wrapper_py.h"
 #include "paddle/fluid/pybind/imperative.h"
@@ -89,6 +91,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 #include "paddle/fluid/pybind/communicator_py.h"
 #endif
@@ -117,6 +123,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithXPU() {
+#ifndef PADDLE_WITH_XPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithMKLDNN() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
@@ -341,6 +355,10 @@ PYBIND11_MODULE(core_noavx, m) {
 
   m.def("set_num_threads", &platform::SetNumThreads);
 
+#ifdef PADDLE_WITH_CUDA
+  m.def("cudnn_version", &platform::CudnnVersion);
+#endif
+
   m.def("from_dlpack", [](py::capsule *dltensor) {
     DLManagedTensor *dmt = reinterpret_cast<DLManagedTensor *>(
         PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
@@ -466,6 +484,10 @@ PYBIND11_MODULE(core_noavx, m) {
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<float>(place);
            })
+      .def("_alloc_float",
+           [](Tensor &self, paddle::platform::XPUPlace &place) {
+             self.mutable_data<float>(place);
+           })
       .def("_alloc_float",
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<float>(place);
@@ -478,6 +500,10 @@ PYBIND11_MODULE(core_noavx, m) {
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<int>(place);
            })
+      .def("_alloc_int",
+           [](Tensor &self, paddle::platform::XPUPlace &place) {
+             self.mutable_data<int>(place);
+           })
       .def("_alloc_int",
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
@@ -495,6 +521,11 @@ PYBIND11_MODULE(core_noavx, m) {
               paddle::framework::proto::VarType::Type type) {
              return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
            })
+      .def("_mutable_data",
+           [](Tensor &self, paddle::platform::XPUPlace &place,
+              paddle::framework::proto::VarType::Type type) {
+             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
+           })
       .def("_mutable_data",
            [](Tensor &self, paddle::platform::CUDAPlace &place,
               paddle::framework::proto::VarType::Type type) {
@@ -508,6 +539,8 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("_clear", &Tensor::clear)
       .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
+      .def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
+           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
@@ -517,7 +550,7 @@ PYBIND11_MODULE(core_noavx, m) {
         
         Args:
           lod (numpy.ndarray): The data to set.
-          place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the 
+          place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the 
           LoDTensor is to be set.
           zero_copy (bool, optional): Whether to share memory with the input numpy array.
           This parameter only works with CPUPlace. Default: False.
@@ -1070,7 +1103,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("find_var", &Scope::FindVar, py::arg("name"),
            R"DOC(
            Find variable named :code:`name` in the current scope or
-           its parent scope. Return None if not found.
+           its parent scope. Return None if not found. 
 
            Args:
                name (str): the variable name.
@@ -1213,8 +1246,6 @@ All parameter, weight, gradient are variables in Paddle.
         []() { return std::string(framework::kEmptyVarName); });
   m.def("grad_var_suffix",
         []() { return std::string(framework::kGradVarSuffix); });
-  m.def("loaded_var_suffix",
-        []() { return std::string(framework::kLoadedVarSuffix); });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
@@ -1227,6 +1258,18 @@ All parameter, weight, gradient are variables in Paddle.
                       -> paddle::platform::DeviceContext* {
                     return new paddle::platform::CPUDeviceContext();
                   })
+      .def_static("create",
+                  [](paddle::platform::XPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifndef PADDLE_WITH_XPU
+             PADDLE_THROW(
+                 platform::errors::PermissionDenied(
+                 "Cannot use XPUPlace in CPU/GPU version, "
+                 "Please recompile or reinstall Paddle with XPU support."));
+#else
+                    return new paddle::platform::XPUDeviceContext(place);
+#endif
+                  })
       .def_static("create",
                   [](paddle::platform::CUDAPlace& place)
                       -> paddle::platform::DeviceContext* {
@@ -1321,14 +1364,75 @@ All parameter, weight, gradient are variables in Paddle.
              std::exit(-1);
 #endif
            })
+#ifdef PADDLE_WITH_CUDA
+      .def("get_device_id",
+           [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
       .def("_type", &PlaceIndex<platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
+      .def("_get_device_id",
+           [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
+#endif
       .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
+  py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
+    **Note**:
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          xpu_place = fluid.XPUPlace(0)
+        )DOC")
+      .def("__init__",
+           [](platform::XPUPlace &self, int dev_id) {
+#ifdef PADDLE_WITH_XPU
+             if (UNLIKELY(dev_id < 0)) {
+               LOG(ERROR) << string::Sprintf(
+                   "Invalid XPUPlace(%d), device id must be 0 or "
+                   "positive integer",
+                   dev_id);
+               std::exit(-1);
+             }
+             if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
+               if (platform::GetXPUDeviceCount() == 0) {
+                 LOG(ERROR) << "Cannot use XPU because there is no XPU "
+                               "detected on your "
+                               "machine.";
+                 std::exit(-1);
+               } else {
+                 LOG(ERROR) << string::Sprintf(
+                     "Invalid XPUPlace(%d), must inside [0, %d), because XPU "
+                     "number on your machine is %d",
+                     dev_id, platform::GetXPUDeviceCount(),
+                     platform::GetXPUDeviceCount());
+                 std::exit(-1);
+               }
+             }
+             new (&self) platform::XPUPlace(dev_id);
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use XPU because you have installed CPU/GPU version "
+                 "PaddlePaddle.\n"
+                 "If you want to use XPU, please try to install XPU version "
+                 "PaddlePaddle by: pip install paddlepaddle-xpu\n"
+                 "If you only have CPU, please change XPUPlace(%d) to be "
+                 "CPUPlace().\n",
+                 dev_id);
+             std::exit(-1);
+#endif
+           })
+      .def("_type", &PlaceIndex<platform::XPUPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
+      .def("__str__", string::to_string<const platform::XPUPlace &>);
+
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
     CPUPlace is a descriptor of a device.
     It represents a CPU device allocated or to be allocated with Tensor or LoDTensor.
@@ -1343,6 +1447,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("_type", &PlaceIndex<platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
       .def("_equals",
@@ -1377,6 +1482,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
       .def("_equals",
@@ -1389,11 +1496,14 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
       .def("is_cpu_place",
            [](platform::Place &self) { return platform::is_cpu_place(self); })
+      .def("is_xpu_place",
+           [](platform::Place &self) { return platform::is_xpu_place(self); })
       .def("is_cuda_pinned_place",
            [](platform::Place &self) {
              return platform::is_cuda_pinned_place(self);
@@ -1402,12 +1512,20 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) {
              return BOOST_GET_CONST(platform::CUDAPlace, self).device;
            })
+      .def("xpu_device_id",
+           [](platform::Place &self) {
+             return BOOST_GET_CONST(platform::XPUPlace, self).device;
+           })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;
            })
+      .def("set_place",
+           [](platform::Place &self, const platform::XPUPlace &xpu_place) {
+             self = xpu_place;
+           })
       .def("set_place",
            [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
@@ -1435,6 +1553,9 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CPUPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::XPUPlace &place) { self.Run(scope, place); })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CUDAPlace &place) { self.Run(scope, place); })
@@ -1535,6 +1656,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
@@ -2490,11 +2612,15 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #ifdef PADDLE_WITH_NCCL
   BindNCCLWrapper(&m);
+#endif
+#ifdef PADDLE_WITH_GLOO
+  BindGlooContext(&m);
 #endif
   BindGraph(&m);
   BindNode(&m);
   BindInferenceApi(&m);
   BindDataset(&m);
+  BindGenerator(&m);
 #ifdef PADDLE_WITH_CRYPTO
   BindCrypto(&m);
 #endif
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 0dd30e562b66847551e5f27b45042fb077fc7bc7..856c5aac5eb38c7da82a956c5823d1a19be5c8d7 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -125,11 +125,12 @@ class MultiDeviceFeedReader {
       const std::vector<framework::proto::VarType::Type> &dtypes,
       const std::vector<bool> &need_check_feed,
       const std::vector<platform::Place> &dst_places, bool use_double_buffer,
-      bool drop_last)
+      bool drop_last, bool pin_memory = false)
       : queue_(queue),
         names_(names),
         pool_(new ::ThreadPool(dst_places.size())),
-        drop_last_(drop_last) {
+        drop_last_(drop_last),
+        pin_memory_(pin_memory) {
     std::vector<framework::DDim> dims;
     for (auto &shape : shapes) {
       dims.push_back(framework::make_ddim(shape));
@@ -157,7 +158,7 @@ class MultiDeviceFeedReader {
         VLOG(10) << "Creating " << i << "-th BufferedReader";
         holder->Reset(
             framework::MakeDecoratedReader<operators::reader::BufferedReader>(
-                reader, p, 2));
+                reader, p, 2, pin_memory_));
       } else {
         if (platform::is_gpu_place(p)) {
           PADDLE_THROW(platform::errors::PermissionDenied(
@@ -322,6 +323,7 @@ class MultiDeviceFeedReader {
 
   std::vector<std::vector<framework::LoDTensor>> ret_;
   bool drop_last_;
+  bool pin_memory_;
 };
 
 template <typename QueueType>
@@ -445,10 +447,10 @@ void BindReader(py::module *module) {
            const std::vector<framework::proto::VarType::Type> &dtypes,
            const std::vector<bool> &need_check_feed,
            const std::vector<platform::Place> &dst_places,
-           bool use_double_buffer, bool drop_last) {
+           bool use_double_buffer, bool drop_last, bool pin_memory) {
           return new MultiDeviceFeedReader<reader::LoDTensorBlockingQueue>(
               queue, names, shapes, dtypes, need_check_feed, dst_places,
-              use_double_buffer, drop_last);
+              use_double_buffer, drop_last, pin_memory);
         },
         py::return_value_policy::take_ownership);
 
@@ -461,12 +463,12 @@ void BindReader(py::module *module) {
          const std::vector<framework::proto::VarType::Type> &dtypes,
          const std::vector<bool> &need_check_feed,
          const std::vector<platform::Place> &dst_places, bool use_double_buffer,
-         bool drop_last) {
+         bool drop_last, bool pin_memory) {
         queue->SetDeviceCount(dst_places.size());
         return new MultiDeviceFeedReader<
             reader::OrderedMultiDeviceLoDTensorBlockingQueue>(
             queue, names, shapes, dtypes, need_check_feed, dst_places,
-            use_double_buffer, drop_last);
+            use_double_buffer, drop_last, pin_memory);
       },
       py::return_value_policy::take_ownership);
 }
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index ba79c4b44374eb9b50ad4982a2eacd664fc6e75e..4377a8c2cef5aab7a200955cd25830d448014817 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <tuple>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -145,8 +146,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
   T b = static_cast<T>(0);
   if (platform::is_cpu_place(self.place())) {
     b = self.data<T>()[offset];
+  } else if (platform::is_xpu_place(self.place())) {
+#ifdef PADDLE_WITH_XPU
+    const T *a = self.data<T>();
+    auto p = BOOST_GET_CONST(platform::XPUPlace, self.place());
+    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
+#endif
+  } else if (platform::is_gpu_place(self.place())) {
 #ifdef PADDLE_WITH_CUDA
-  } else {
     const T *a = self.data<T>();
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
@@ -163,8 +170,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
                         "The offset exceeds the size of tensor."));
   if (platform::is_cpu_place(self->place())) {
     self->mutable_data<T>(self->place())[offset] = elem;
+  } else if (platform::is_xpu_place(self->place())) {
+#ifdef PADDLE_WITH_XPU
+    auto p = BOOST_GET_CONST(platform::XPUPlace, self->place());
+    T *a = self->mutable_data<T>(p);
+    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
+#endif
+  } else if (platform::is_gpu_place(self->place())) {
 #ifdef PADDLE_WITH_CUDA
-  } else {
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
@@ -194,6 +207,16 @@ void SetTensorFromPyArrayT(
       auto dst = self->mutable_data<T>(place);
       std::memcpy(dst, array.data(), array.nbytes());
     }
+  } else if (paddle::platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    auto dst = self->mutable_data<T>(place);
+    xpu_memcpy(dst, array.data(), array.nbytes(),
+               XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use XPUPlace in CPU/GPU version, "
+        "Please recompile or reinstall Paddle with XPU support."));
+#endif
   } else {
 #ifdef PADDLE_WITH_CUDA
     auto dst = self->mutable_data<T>(place);
@@ -211,7 +234,7 @@ void SetTensorFromPyArrayT(
     }
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
-        "Cannot use CUDAPlace in CPU only version, "
+        "Cannot use CUDAPlace or CUDAPinnedPlace in CPU only version, "
         "Please recompile or reinstall Paddle with CUDA support."));
 #endif
   }
@@ -354,8 +377,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
   if (platform::is_cpu_place(place)) {
     output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place),
                          self.type());
-#ifdef PADDLE_WITH_CUDA
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place),
+                         self.type());
+#endif
   } else {
+#ifdef PADDLE_WITH_CUDA
     if (platform::is_cuda_pinned_place(place)) {
       output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                            self.type());
@@ -516,6 +544,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
     return py::array();
   }
   bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
+  bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
   const auto &tensor_dims = tensor.dims();
   auto tensor_dtype = tensor.type();
   size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
@@ -534,11 +563,11 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
 
   std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
 
-  if (!is_gpu_tensor) {
+  if (!is_gpu_tensor && !is_xpu_tensor) {
     if (!need_deep_copy) {
-      return py::array(py::buffer_info(
-          const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
-          static_cast<size_t>(tensor.dims().size()), py_dims, py_strides));
+      auto base = py::cast(std::move(tensor));
+      return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
+                       const_cast<void *>(tensor_buf_ptr), base);
     } else {
       py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
       PADDLE_ENFORCE_EQ(
@@ -557,28 +586,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
                            copy_bytes);
       return py_arr;
     }
-  }
-
+  } else if (is_xpu_tensor) {
+#ifdef PADDLE_WITH_XPU
+    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
+    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
+                      platform::errors::InvalidArgument(
+                          "PyArray is not writable, in which case memory leak "
+                          "or double free would occur"));
+    PADDLE_ENFORCE_EQ(
+        py_arr.owndata(), true,
+        platform::errors::InvalidArgument(
+            "PyArray does not own data, in which case  memory leak "
+            "or double free would occur"));
+
+    size_t copy_bytes = sizeof_dtype * numel;
+    auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place());
+    paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p,
+                         tensor_buf_ptr, copy_bytes);
+    return py_arr;
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use XPUPlace in CPU/GPU version, "
+        "Please recompile or reinstall Paddle with XPU support."));
+#endif
+  } else if (is_gpu_tensor) {
 #ifdef PADDLE_WITH_CUDA
-  py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
-  PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
-                    platform::errors::InvalidArgument(
-                        "PyArray is not writable, in which case memory leak "
-                        "or double free would occur"));
-  PADDLE_ENFORCE_EQ(py_arr.owndata(), true,
-                    platform::errors::InvalidArgument(
-                        "PyArray does not own data, in which case  memory leak "
-                        "or double free would occur"));
-
-  size_t copy_bytes = sizeof_dtype * numel;
-  paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
-                                  copy_bytes, cudaMemcpyDeviceToHost);
-  return py_arr;
+    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
+    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
+                      platform::errors::InvalidArgument(
+                          "PyArray is not writable, in which case memory leak "
+                          "or double free would occur"));
+    PADDLE_ENFORCE_EQ(
+        py_arr.owndata(), true,
+        platform::errors::InvalidArgument(
+            "PyArray does not own data, in which case  memory leak "
+            "or double free would occur"));
+
+    size_t copy_bytes = sizeof_dtype * numel;
+    paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
+                                    copy_bytes, cudaMemcpyDeviceToHost);
+    return py_arr;
 #else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "Cannot use CUDAPlace in CPU only version, "
-      "Please recompile or reinstall Paddle with CUDA support."));
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use CUDAPlace in CPU only version, "
+        "Please recompile or reinstall Paddle with CUDA support."));
 #endif
+  }
+  PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
+  return py::array();
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index 04870f87c40dd305304579a454cb618bf1446e39..1f88eb2109aa23b6b60104451908b0a70c41c898 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -29,6 +29,8 @@ function(train_test TARGET_NAME)
                 PROPERTIES DEPENDS test_${TARGET_NAME})
         set_tests_properties(test_train_${TARGET_NAME}${arg}
                 PROPERTIES LABELS "RUN_TYPE=DIST")
+        set_tests_properties(test_train_${TARGET_NAME}${arg}
+                PROPERTIES TIMEOUT 150)
     endforeach()
 endfunction(train_test)
 
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 39db5a601d3d46c106a574870f02434bd4bd5cd1..d7a86b653bec44c260a845d454c771ec4440993b 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
 | `RUN_TEST` | OFF | Run unit test immediently after the build. |
-| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
 
 ## Docker Images
 
@@ -155,21 +154,6 @@ docker push
 kubectl ...
 ```
 
-### Reading source code with woboq codebrowser
-
-For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
-
-- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
-
-```bash
-./paddle/scripts/paddle_docker_build.sh html
-```
-
-- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
-
-```
-docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
-```
 
 ## More Options
 
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 0c96906afb917c2544c9fe4e2172033e84102e4f..c84574b21d883b24e1f89c59c3a724aae6621479 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -1,21 +1,91 @@
-@ECHO OFF
+@ECHO ON
 SETLOCAL
 
+rem Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+rem
+rem Licensed under the Apache License, Version 2.0 (the "License");
+rem you may not use this file except in compliance with the License.
+rem You may obtain a copy of the License at
+rem
+rem     http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+
+rem =================================================
+rem       Paddle CI Task On Windows Platform
+rem =================================================
+
 set work_dir=%cd%
+if exist build rmdir build /s/q
+mkdir build
+cd /d build
+
+rem ------initialize the virtual environment------
+if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
+set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
+
+rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled
+rem Now use system python environment temporarily
+rem set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
+rem %PYTHON_EXECUTABLE% -m pip install virtualenv
+rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
+rem call paddle_winci\Scripts\activate.bat
+
+rem ------pre install requirement----------
+where python
+where pip
+pip install --upgrade pip --user
+pip install wheel --user
+pip install gym --user
+pip install -U -r %work_dir%\python\requirements.txt --user
+if %ERRORLEVEL% NEQ 0 (
+    call paddle_winci\Scripts\deactivate.bat 2>NUL
+    echo pip install requirements.txt failed!
+    exit /b 7
+)
+
+rem ------initialize common variable------
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
-if not defined PYTHON_ROOT set PYTHON_ROOT=c:\Python27
-if not defined WITH_MKL set WITH_MKL=ON
-if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_AVX set WITH_AVX=ON
-if not defined WITH_GPU set WITH_GPU=OFF
 if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF
-if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=d:/.cache/inference_demo
-if not defined THIRD_PARTY_PATH set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
-set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
-dir d:\.cache
+if not defined WITH_TPCACHE set WITH_TPCACHE=ON
+
+rem ------set cache third_party------
+set cache_dir=%work_dir%\..\cache
+dir %cache_dir%
+set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
+
+if not exist %cache_dir%\tools (
+    git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
+    if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL%
+)
+
+if "%WITH_TPCACHE%"=="OFF" (
+    set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
+    goto :CASE_%1
+)
+
+echo set -ex > cache.sh
+echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake  ^|md5sum ^| awk '{print $1}') >> cache.sh
+echo echo ${md5_content}^>md5.txt >> cache.sh
+
+%cache_dir%\tools\busybox64.exe cat cache.sh
+%cache_dir%\tools\busybox64.exe bash cache.sh
+
+set /p md5=< md5.txt
+if "%WITH_GPU%"=="ON" (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5%
+) else (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5%
+)
 
 goto :CASE_%1
 
@@ -26,6 +96,8 @@ echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows"
 exit /b 1
 
 :CASE_wincheck_mkl
+set WITH_MKL=ON
+set WITH_GPU=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
@@ -35,24 +107,32 @@ call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 :CASE_wincheck_openblas
+set WITH_MKL=OFF
+set WITH_GPU=ON
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 goto:success
 
+rem "Other configurations are added here"
+rem :CASE_wincheck_others
+rem call ...
+
+
 rem ---------------------------------------------------------------------------------------------
 :cmake
 echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
 
-mkdir build
-cd /d build
-cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
+cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
 goto:eof
 
 :cmake_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Cmake failed, will exit!
+exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :build
@@ -60,38 +140,42 @@ echo    ========================================
 echo    Step 2. Buile Paddle ...
 echo    ========================================
 call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
-set build_times=1
 
+set build_times=1
 :build_tp
-echo BUILD THIRD_PARTY %build_times%
+echo Build third_party for %build_times% time:
 msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
-echo BUILD THIRD_PARTY RESULT %ERRORLEVEL%
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1  
     if %build_times% GTR 3 (
-        exit /b 1
+        exit /b 7
     ) else (
+        echo Build third_party failed, will retry!
         goto :build_tp
     )
 )
+echo Build third_party successfully!
 
 set build_times=1
 :build_paddle
-echo BUILD PADDLE %build_times%
-msbuild /m /p:Configuration=Release /verbosity:quiet paddle.sln
-echo BUILD PADDLE RESULT %ERRORLEVEL%
+echo Build Paddle for %build_times% time:
+msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1
     if %build_times% GTR 2 (
-        exit /b 1
+        exit /b 7
     ) else (
+        echo Build Paddle failed, will retry!
         goto :build_paddle
     )
 )
+echo Build Paddle successfully!
 goto:eof
 
 :build_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Build Paddle failed, will exit!
+exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :test_whl_pacakage
@@ -100,49 +184,61 @@ echo    Step 3. Test pip install whl package ...
 echo    ========================================
 dir /s /b python\dist\*.whl > whl_file.txt
 set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
-%PYTHON_EXECUTABLE% -m pip install -U %PADDLE_WHL_FILE_WIN%
-echo import paddle.fluid;print(paddle.__version__) > test_whl.py
-%PYTHON_EXECUTABLE% test_whl.py
+
+pip uninstall -y paddlepaddle
+pip uninstall -y paddlepaddle-gpu
+pip install -U %PADDLE_WHL_FILE_WIN% --user
+if %ERRORLEVEL% NEQ 0 (
+    call paddle_winci\Scripts\deactivate.bat 2>NUL
+    echo pip install whl package failed!
+    exit /b 3
+)
+
+python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof
 
 :test_whl_pacakage_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Test import paddle failed, will exit!
+exit /b 3
 
 rem ---------------------------------------------------------------------------------------------
 :unit_test
 echo    ========================================
 echo    Step 4. Running unit tests ...
 echo    ========================================
-%PYTHON_EXECUTABLE% -m pip install --upgrade pip
-dir %work_dir%\build\third_party\install\openblas\lib
-dir %work_dir%\build\third_party\install\openblas\bin
-dir %work_dir%\build\third_party\install\zlib\bin
-dir %work_dir%\build\third_party\install\mklml\lib
-dir %work_dir%\build\third_party\install\mkldnn\bin
-dir %work_dir%\build\third_party\install\warpctc\bin
-
-set PATH=%work_dir%\build\third_party\install\openblas\lib;%work_dir%\build\third_party\install\openblas\bin;%work_dir%\build\third_party\install\zlib\bin;%work_dir%\build\third_party\install\mklml\lib;%work_dir%\build\third_party\install\mkldnn\bin;%work_dir%\build\third_party\install\warpctc\bin;%PATH%
-ctest.exe --output-on-failure -C Release -j 10
+
+dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
+dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
+dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
+dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib
+dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
+dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
+
+set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
+ctest.exe --output-on-failure -C Release -j 8
 goto:eof
 
 :unit_test_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Running unit tests failed, will exit!
+exit /b 8
 
 rem ---------------------------------------------------------------------------------------------
 :test_inference
 echo    ========================================
 echo    Step 5. Testing fluid library for inference ...
 echo    ========================================
-if NOT EXIST "d:\.cache\tools" (
-  git clone https://github.com/zhouwei25/tools.git d:\.cache\tools
-)
+
 cd %work_dir%\paddle\fluid\inference\api\demo_ci
 
-d:\.cache\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% d:/.cache/inference_demo
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
 goto:eof
 
 :test_inference_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Testing fluid library for inference failed!
+exit /b 5
 
 rem ---------------------------------------------------------------------------------------------
 :check_change_of_unittest
@@ -164,7 +260,7 @@ echo     ============================================ >>  check_change_of_unitte
 echo     Generate unit tests.spec of this PR.         >>  check_change_of_unittest.sh
 echo     ============================================ >>  check_change_of_unittest.sh
 echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_PR.spec>>  check_change_of_unittest.sh
+echo spec_path=$(pwd)/UNITTEST_PR.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
 echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>>  check_change_of_unittest.sh
 echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>>  check_change_of_unittest.sh
@@ -179,16 +275,16 @@ echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>>  check_c
 echo     git fetch upstream $BRANCH # develop is not fetched>>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
 echo git checkout -b origin_pr >>  check_change_of_unittest.sh
-echo git checkout -b test_pr -t upstream/$BRANCH >>  check_change_of_unittest.sh
-echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE:\=\\% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >>  check_change_of_unittest.sh
+echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >>  check_change_of_unittest.sh
 echo cat ^<^<EOF>>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo     Generate unit tests.spec of develop.               >>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
+echo spec_path=$(pwd)/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
-echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/../paddle/fluid/UNITTEST_DEV.spec $(pwd)/../paddle/fluid/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
+echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
 echo if [ "$unittest_spec_diff" != "" ]; then>>  check_change_of_unittest.sh
 echo     # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>>  check_change_of_unittest.sh
 echo     approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>>  check_change_of_unittest.sh
@@ -210,12 +306,13 @@ echo     else>>  check_change_of_unittest.sh
 echo          exit 1 >>  check_change_of_unittest.sh
 echo     fi>>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
-echo git checkout origin_pr >>  check_change_of_unittest.sh
-d:\.cache\tools\busybox64.exe bash check_change_of_unittest.sh
+echo git checkout -f origin_pr >>  check_change_of_unittest.sh
+%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh
 goto:eof
 
 :check_change_of_unittest_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+exit /b 1
 
 
 rem ---------------------------------------------------------------------------------------------
@@ -233,6 +330,10 @@ taskkill /f /im git-remote-https.exe 2>NUL
 taskkill /f /im vctip.exe 2>NUL
 taskkill /f /im cvtres.exe 2>NUL
 taskkill /f /im rc.exe 2>NUL
+taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe  2>NUL
+taskkill /f /im python.exe  2>NUL
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+taskkill /f /im python.exe  2>NUL
 echo Windows CI run successfully!
 exit /b 0
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8684851283f21b0384bc2aa95808c2594726f122..a77d605eb6c26b02c38a58195d1f8f1e84a3dc20 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -195,6 +195,12 @@ function cmake_base() {
     distibuted_flag=${WITH_DISTRIBUTE:-OFF}
     grpc_flag=${WITH_GRPC:-${distibuted_flag}}
 
+    if [ "$SYSTEM" == "Darwin" ]; then
+        gloo_flag="OFF"
+    else
+        gloo_flag=${distibuted_flag}
+    fi
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -219,6 +225,7 @@ function cmake_base() {
         -DPY_VERSION=${PY_VERSION:-2.7}
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
         -DWITH_GRPC=${grpc_flag}
+	    -DWITH_GLOO=${gloo_flag}
         -DWITH_LITE=${WITH_LITE:-OFF}
         -DLITE_GIT_TAG=develop
     ========================================
@@ -249,6 +256,7 @@ EOF
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
         -DWITH_GRPC=${grpc_flag} \
+	    -DWITH_GLOO=${gloo_flag} \
         -DLITE_GIT_TAG=develop \
         -DWITH_LITE=${WITH_LITE:-OFF};build_error=$?
     if [ "$build_error" != 0 ];then
@@ -521,13 +529,16 @@ EOF
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
         ut_startTime_s=`date +%s`
-        ctest --output-on-failure -j $2
+        ctest --output-on-failure -j $2;mactest_error=$?
         ut_endTime_s=`date +%s`
         echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
         paddle version
         # Recovery proxy to avoid failure in later steps
         export http_proxy=$my_proxy
         export https_proxy=$my_proxy
+        if [ "$mactest_error" != 0 ];then
+            exit 8;
+        fi
     fi
 }
 
@@ -562,6 +573,7 @@ function generate_upstream_develop_api_spec() {
 }
 
 function generate_api_spec() {
+    set -e
     spec_kind=$2
     if [ "$spec_kind" != "PR" ] && [ "$spec_kind" != "DEV" ]; then
         echo "Not supported $2"
@@ -572,7 +584,8 @@ function generate_api_spec() {
     cd ${PADDLE_ROOT}/build/.check_api_workspace
     virtualenv .${spec_kind}_env
     source .${spec_kind}_env/bin/activate
-    pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
 
@@ -686,6 +699,13 @@ function assert_api_spec_approvals() {
     fi
 }
 
+function assert_file_diff_approvals() {
+    /bin/bash ${PADDLE_ROOT}/tools/check_file_diff_approvals.sh;file_approval_error=$?
+    if [ "$file_approval_error" != 0 ];then
+       exit 6
+    fi
+}
+
 
 function check_coverage() {
     /bin/bash ${PADDLE_ROOT}/tools/coverage/paddle_coverage.sh
@@ -860,6 +880,7 @@ set +x
         multiple_card_tests=''    # cases list which would take multiple GPUs, most cases would be two GPUs
         is_exclusive=''           # indicate whether the case is exclusive type
         is_multicard=''           # indicate whether the case is multiple GPUs type
+        is_nightly=''             # indicate whether the case will only run at night
         while read -r line; do
             if [[ "$line" == "" ]]; then
                 continue
@@ -869,12 +890,19 @@ set +x
                     # Any test case with LABELS property would be parse here
                     # RUN_TYPE=EXCLUSIVE mean the case would run exclusively
                     # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime
+                    # RUN_TYPE=NIGHTLY or RUN_TYPE=DIST:NIGHTLY or RUN_TYPE=EXCLUSIVE:NIGHTLY means the case will ONLY run at night
                     read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE")
                     read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST")
+                    read is_nightly <<< $(echo "$line"|grep -oEi "RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY")
                     continue
                 fi
                 read testcase <<< $(echo "$line"|grep -oEi "\w+$")
 
+                if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
+                    echo $testcase" will only run at night."
+                    continue
+                fi
+
                 if [[ "$is_multicard" == "" ]]; then
                   # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
                   read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
@@ -893,7 +921,7 @@ set +x
                         multiple_card_tests="$multiple_card_tests|^$testcase$"
                     fi
                 else
-                    if [[ "${#single_card_tests}" -gt 3000 ]];then
+                    if [[ "${#single_card_tests}" -gt 10000 ]];then
                         if [[ "$single_card_tests_1" == "" ]]; then 
                             single_card_tests_1="^$testcase$"
                         else
@@ -910,6 +938,7 @@ set +x
                 fi
                 is_exclusive=''
                 is_multicard=''
+                is_nightly=''
                 matchstr=''
                 testcase=''
         done <<< "$test_cases";
@@ -919,17 +948,96 @@ set +x
         card_test "$multiple_card_tests" 2  # run cases with two GPUs
         card_test "$exclusive_tests"        # run cases exclusively, in this cases would be run with 4/8 GPUs
         collect_failed_tests
-        if [ -n "${failed_test_lists}" ];then
-            failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
-            echo "========================================"
-            echo "Summary Failed Tests... "
-            echo "========================================"
-            echo "The following tests FAILED: "
-            echo "${failed_test_lists_ult}"
-        fi
         rm -f $tmp_dir/*
+        exec_times=0
+        retry_unittests_record=''
+        retry_time=3
+        exec_time_array=('first' 'second' 'third')
+        if [ -n "$failed_test_lists" ];then
+            while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] )
+                do
+                    
+                    retry_unittests_record="$retry_unittests_record$failed_test_lists"
+                    failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                    read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                    echo "========================================="
+                    echo "This is the ${exec_time_array[$exec_times]} time to re-run"
+                    echo "========================================="
+                    echo "The following unittest will be re-run:"
+                    echo "${failed_test_lists_ult}"
+                        
+                    for line in ${retry_unittests[@]} ;
+                        do
+
+                            one_card_tests=$single_card_tests'|'$single_card_tests_1
+
+                            read tmp_one_tmp <<< "$( echo $one_card_tests | grep -oEi $line )"
+                            read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )"
+                            read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )"
+
+                            if [[ "$tmp_one_tmp" != ""  ]]; then
+                                if [[ "$one_card_retry" == "" ]]; then
+                                    one_card_retry="^$line$"
+                                else
+                                    one_card_retry="$one_card_retry|^$line$"
+                                fi
+                            elif [[ "$tmp_mul_tmp" != "" ]]; then
+                                if [[ "$multiple_card_retry" == "" ]]; then
+                                    multiple_card_retry="^$line$"
+                                else
+                                    multiple_card_retry="$multiple_card_retry|^$line$"
+                                fi
+                            else
+                                if [[ "$exclusive_retry" == "" ]];then
+                                    exclusive_retry="^$line$"
+                                else
+                                    exclusive_retry="$exclusive_retry|^$line$"
+                                fi
+                            fi
+
+                        done
+
+                    if [[ "$one_card_retry" != "" ]]; then
+                        card_test "$one_card_retry" 1
+                    fi
+
+                    if [[ "$multiple_card_retry" != "" ]]; then
+                        card_test "$multiple_card_retry" 2
+                    fi
+
+                    if [[ "$exclusive_retry" != "" ]]; then
+                        card_test "$exclusive_retry"
+                    fi
+                    
+                    exec_times=$[$exec_times+1]
+                    failed_test_lists=''
+                    collect_failed_tests
+                    rm -f $tmp_dir/*
+                    one_card_retry=''
+                    multiple_card_retry=''
+                    exclusive_retry=''
+                    retry_unittests=''
+                done
+        fi
+
+
+       
         if [[ "$EXIT_CODE" != "0" ]]; then
-            exit 8;
+            if [[ "$failed_test_lists" == "" ]]; then
+                echo "========================================"
+                echo "There are failed tests, which have been successful after re-run:"
+                echo "========================================"
+                echo "The following tests have been re-ran:"
+                echo "${retry_unittests_record}"
+            else
+                failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                echo "========================================"
+                echo "Summary Failed Tests... "
+                echo "========================================"
+                echo "The following tests FAILED: "
+                echo "${failed_test_lists_ult}"
+                exit 8;
+            fi
         fi
 set -ex
     fi
@@ -1009,22 +1117,6 @@ EOF
       esac
 }
 
-function gen_html() {
-    cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-    export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-    	-b ${PADDLE_ROOT}/build \
-    	-a \
-    	-o $WOBOQ_OUT \
-    	-p paddle:${PADDLE_ROOT}
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-}
 
 function gen_dockerfile() {
     # Set BASE_IMAGE according to env variables
@@ -1260,9 +1352,13 @@ EOF
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
              ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
              ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
+    EXIT_CODE=$?
     fluid_endTime_s=`date +%s`
     echo "test_fluid_lib Total Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s"          
     ./clean.sh
+    if [[ "$EXIT_CODE" != "0" ]]; then
+        exit 8;
+    fi
 }
 
 function test_fluid_lib_train() {
@@ -1274,9 +1370,13 @@ EOF
     fluid_train_startTime_s=`date +%s`
     cd ${PADDLE_ROOT}/paddle/fluid/train/demo
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON}
+    EXIT_CODE=$?
     fluid_train_endTime_s=`date +%s`
     echo "test_fluid_lib_train Total Time: $[ $fluid_train_endTime_s - $fluid_train_startTime_s ]s"
     ./clean.sh
+    if [[ "$EXIT_CODE" != "0" ]]; then
+        exit 8;
+    fi
 }
 
 function build_document_preview() {
@@ -1339,9 +1439,6 @@ function main() {
       gen_doc_lib)
         gen_doc_lib $2
         ;;
-      html)
-        gen_html
-        ;;
       dockerfile)
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
@@ -1380,7 +1477,7 @@ function main() {
       test_inference)
         gen_fluid_lib ${parallel_number}
         test_fluid_lib
-        test_fluid_lib_train
+        #test_fluid_lib_train
         ;;
       test_train)
         gen_fluid_lib ${parallel_number}
@@ -1389,6 +1486,9 @@ function main() {
       assert_api_approvals)
         assert_api_spec_approvals
         ;;
+      assert_file_approvals)
+        assert_file_diff_approvals
+        ;; 
       maccheck)
         cmake_gen_and_build_mac ${PYTHON_ABI:-""}
         run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 59dfc5c9d0311342fc72d8400a3abddd3f6d778b..779a6842ebb03e2afcdb7718f77eb9b0d2c09a83 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -75,14 +75,12 @@ IF(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
     COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ELSE(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND touch stub.cc
     COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()
 
@@ -93,6 +91,7 @@ set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 if (WITH_TESTING)
   add_subdirectory(paddle/reader/tests)
   add_subdirectory(paddle/dataset/tests)
+  add_subdirectory(paddle/tests)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
   add_subdirectory(paddle/fluid/contrib/slim/tests)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
old mode 100644
new mode 100755
index 6cc986c61e1db1990cde9598cccd5ee307b31df5..c22eee3df6f294d0e364b734c9472a0ef62270e4
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -31,27 +31,33 @@ import paddle.reader
 import paddle.dataset
 import paddle.batch
 batch = batch.batch
+from .fluid import monkey_patch_variable
+from .fluid.dygraph import monkey_patch_math_varbase
+monkey_patch_variable()
+monkey_patch_math_varbase()
+import paddle.framework
+from .framework import VarBase as Tensor
+from .framework import ComplexVariable as ComplexTensor
 import paddle.compat
 import paddle.distributed
 import paddle.sysconfig
 import paddle.tensor
+import paddle.distribution
 import paddle.nn
-import paddle.fleet
-import paddle.framework
-import paddle.imperative
+import paddle.distributed.fleet
 import paddle.optimizer
 import paddle.metric
+import paddle.device
 import paddle.incubate.complex as complex
 
 # TODO: define alias in tensor and framework directory
 
 from .tensor.random import randperm
+from .tensor.random import bernoulli
 
 from .tensor.attribute import rank  #DEFINE_ALIAS
 from .tensor.attribute import shape  #DEFINE_ALIAS
-from .tensor.creation import create_tensor  #DEFINE_ALIAS
-# from .tensor.creation import create_lod_tensor        #DEFINE_ALIAS
-# from .tensor.creation import create_random_int_lodtensor        #DEFINE_ALIAS
+from .tensor.creation import to_tensor  #DEFINE_ALIAS
 from .tensor.creation import crop_tensor  #DEFINE_ALIAS
 from .tensor.creation import diag  #DEFINE_ALIAS
 from .tensor.creation import eye  #DEFINE_ALIAS
@@ -69,8 +75,6 @@ from .tensor.creation import full_like  #DEFINE_ALIAS
 from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
-from .tensor.io import save  #DEFINE_ALIAS
-from .tensor.io import load  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS
@@ -87,7 +91,7 @@ from .tensor.logic import equal  #DEFINE_ALIAS
 from .tensor.logic import greater_equal  #DEFINE_ALIAS
 from .tensor.logic import greater_than  #DEFINE_ALIAS
 from .tensor.logic import is_empty  #DEFINE_ALIAS
-from .tensor.logic import isfinite  #DEFINE_ALIAS
+#from .tensor.logic import isfinite  #DEFINE_ALIAS
 from .tensor.logic import less_equal  #DEFINE_ALIAS
 from .tensor.logic import less_than  #DEFINE_ALIAS
 from .tensor.logic import logical_and  #DEFINE_ALIAS
@@ -103,7 +107,9 @@ from .tensor.logic import equal_all  #DEFINE_ALIAS
 from .tensor.manipulation import cast  #DEFINE_ALIAS
 from .tensor.manipulation import concat  #DEFINE_ALIAS
 from .tensor.manipulation import expand  #DEFINE_ALIAS
+from .tensor.manipulation import broadcast_to  #DEFINE_ALIAS
 from .tensor.manipulation import expand_as  #DEFINE_ALIAS
+from .tensor.manipulation import tile  #DEFINE_ALIAS
 from .tensor.manipulation import flatten  #DEFINE_ALIAS
 from .tensor.manipulation import gather  #DEFINE_ALIAS
 from .tensor.manipulation import gather_nd  #DEFINE_ALIAS
@@ -126,6 +132,7 @@ from .tensor.manipulation import unstack  #DEFINE_ALIAS
 from .tensor.manipulation import flip  #DEFINE_ALIAS
 from .tensor.manipulation import unbind  #DEFINE_ALIAS
 from .tensor.manipulation import roll  #DEFINE_ALIAS
+from .tensor.manipulation import chunk  #DEFINE_ALIAS
 from .tensor.math import abs  #DEFINE_ALIAS
 from .tensor.math import acos  #DEFINE_ALIAS
 from .tensor.math import asin  #DEFINE_ALIAS
@@ -137,8 +144,6 @@ from .tensor.math import cumsum  #DEFINE_ALIAS
 from .tensor.math import elementwise_add  #DEFINE_ALIAS
 from .tensor.math import elementwise_div  #DEFINE_ALIAS
 from .tensor.math import elementwise_floordiv  #DEFINE_ALIAS
-from .tensor.math import elementwise_max  #DEFINE_ALIAS
-from .tensor.math import elementwise_min  #DEFINE_ALIAS
 from .tensor.math import elementwise_mod  #DEFINE_ALIAS
 from .tensor.math import elementwise_pow  #DEFINE_ALIAS
 from .tensor.math import elementwise_sub  #DEFINE_ALIAS
@@ -167,9 +172,15 @@ from .tensor.math import sums  #DEFINE_ALIAS
 from .tensor.math import tanh  #DEFINE_ALIAS
 from .tensor.math import elementwise_sum  #DEFINE_ALIAS
 from .tensor.math import max  #DEFINE_ALIAS
+from .tensor.math import maximum  #DEFINE_ALIAS
 from .tensor.math import min  #DEFINE_ALIAS
+from .tensor.math import minimum  #DEFINE_ALIAS
 from .tensor.math import mm  #DEFINE_ALIAS
-from .tensor.math import div  #DEFINE_ALIAS
+from .tensor.math import divide  #DEFINE_ALIAS
+from .tensor.math import floor_divide  #DEFINE_ALIAS
+from .tensor.math import remainder  #DEFINE_ALIAS
+from .tensor.math import mod  #DEFINE_ALIAS
+from .tensor.math import floor_mod  #DEFINE_ALIAS
 from .tensor.math import multiply  #DEFINE_ALIAS
 from .tensor.math import add  #DEFINE_ALIAS
 from .tensor.math import atan  #DEFINE_ALIAS
@@ -179,11 +190,16 @@ from .tensor.math import log1p  #DEFINE_ALIAS
 from .tensor.math import erf  #DEFINE_ALIAS
 from .tensor.math import addcmul  #DEFINE_ALIAS
 from .tensor.math import addmm  #DEFINE_ALIAS
-from .tensor.math import clamp  #DEFINE_ALIAS
+from .tensor.math import clip  #DEFINE_ALIAS
 from .tensor.math import trace  #DEFINE_ALIAS
 from .tensor.math import kron  #DEFINE_ALIAS
-# from .tensor.random import gaussin        #DEFINE_ALIAS
-# from .tensor.random import uniform        #DEFINE_ALIAS
+from .tensor.math import isfinite  #DEFINE_ALIAS
+from .tensor.math import isinf  #DEFINE_ALIAS
+from .tensor.math import isnan  #DEFINE_ALIAS
+from .tensor.math import prod  #DEFINE_ALIAS
+from .tensor.random import standard_normal
+from .tensor.random import normal
+from .tensor.random import uniform  #DEFINE_ALIAS
 from .tensor.random import shuffle  #DEFINE_ALIAS
 from .tensor.random import randn  #DEFINE_ALIAS
 from .tensor.random import rand  #DEFINE_ALIAS
@@ -194,49 +210,60 @@ from .tensor.search import argmin  #DEFINE_ALIAS
 from .tensor.search import argsort  #DEFINE_ALIAS
 from .tensor.search import has_inf  #DEFINE_ALIAS
 from .tensor.search import has_nan  #DEFINE_ALIAS
-# from .tensor.search import masked_select        #DEFINE_ALIAS
+from .tensor.search import masked_select  #DEFINE_ALIAS
 from .tensor.search import topk  #DEFINE_ALIAS
 from .tensor.search import where  #DEFINE_ALIAS
 from .tensor.search import index_select  #DEFINE_ALIAS
 from .tensor.search import nonzero  #DEFINE_ALIAS
 from .tensor.search import sort  #DEFINE_ALIAS
 from .framework.random import manual_seed  #DEFINE_ALIAS
-from .framework import append_backward  #DEFINE_ALIAS
-from .framework import gradients  #DEFINE_ALIAS
-from .framework import Executor  #DEFINE_ALIAS
-from .framework import global_scope  #DEFINE_ALIAS
-from .framework import scope_guard  #DEFINE_ALIAS
-from .framework import BuildStrategy  #DEFINE_ALIAS
-from .framework import CompiledProgram  #DEFINE_ALIAS
-from .framework import default_main_program  #DEFINE_ALIAS
-from .framework import default_startup_program  #DEFINE_ALIAS
+from .framework import Variable  #DEFINE_ALIAS
+from .framework import ParamAttr  #DEFINE_ALIAS
 from .framework import create_global_var  #DEFINE_ALIAS
 from .framework import create_parameter  #DEFINE_ALIAS
-from .framework import Print  #DEFINE_ALIAS
-from .framework import py_func  #DEFINE_ALIAS
-from .framework import ExecutionStrategy  #DEFINE_ALIAS
-from .framework import name_scope  #DEFINE_ALIAS
-from .framework import ParallelExecutor  #DEFINE_ALIAS
-from .framework import ParamAttr  #DEFINE_ALIAS
-from .framework import Program  #DEFINE_ALIAS
-from .framework import program_guard  #DEFINE_ALIAS
-from .framework import Variable  #DEFINE_ALIAS
-from .framework import WeightNormParamAttr  #DEFINE_ALIAS
 from .framework import CPUPlace  #DEFINE_ALIAS
 from .framework import CUDAPlace  #DEFINE_ALIAS
 from .framework import CUDAPinnedPlace  #DEFINE_ALIAS
+
+from .framework import to_variable  #DEFINE_ALIAS
+from .framework import grad  #DEFINE_ALIAS
+from .framework import no_grad  #DEFINE_ALIAS
+from .framework import save  #DEFINE_ALIAS
+from .framework import load  #DEFINE_ALIAS
+from .framework import prepare_context  #DEFINE_ALIAS
+from .framework import ParallelEnv  #DEFINE_ALIAS
+from .framework import DataParallel  #DEFINE_ALIAS
+
+from .framework import NoamDecay  #DEFINE_ALIAS
+from .framework import PiecewiseDecay  #DEFINE_ALIAS
+from .framework import NaturalExpDecay  #DEFINE_ALIAS
+from .framework import ExponentialDecay  #DEFINE_ALIAS
+from .framework import InverseTimeDecay  #DEFINE_ALIAS
+from .framework import PolynomialDecay  #DEFINE_ALIAS
+from .framework import CosineDecay  #DEFINE_ALIAS
+from .framework import set_default_dtype  #DEFINE_ALIAS
+from .framework import get_default_dtype  #DEFINE_ALIAS
+
 from .tensor.search import index_sample  #DEFINE_ALIAS
 from .tensor.stat import mean  #DEFINE_ALIAS
 from .tensor.stat import reduce_mean  #DEFINE_ALIAS
 from .tensor.stat import std  #DEFINE_ALIAS
 from .tensor.stat import var  #DEFINE_ALIAS
 from .fluid.data import data
+from .tensor.stat import numel  #DEFINE_ALIAS
+from .device import get_cudnn_version
+from .device import set_device
+from .device import get_device
 # from .tensor.tensor import Tensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensorArray        #DEFINE_ALIAS
 
 from . import incubate
 from .incubate import hapi
-from .fluid.dygraph.base import enable_dygraph as enable_imperative  #DEFINE_ALIAS
-from .fluid.dygraph.base import disable_dygraph as disable_imperative  #DEFINE_ALIAS
-from .fluid.framework import in_dygraph_mode as in_imperative_mode  #DEFINE_ALIAS
+from .fluid.dygraph.base import enable_dygraph as disable_static  #DEFINE_ALIAS
+from .fluid.dygraph.base import disable_dygraph as enable_static  #DEFINE_ALIAS
+from .fluid.framework import in_dygraph_mode as in_dynamic_mode  #DEFINE_ALIAS
+from .fluid.dygraph.base import no_grad  #DEFINE_ALIAS
+
+from . import jit
+from . import static
diff --git a/python/paddle/declarative/__init__.py b/python/paddle/declarative/__init__.py
deleted file mode 100644
index 0f28cc7f424d5f77f9080dae89f1ec5fa6adb760..0000000000000000000000000000000000000000
--- a/python/paddle/declarative/__init__.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    'fc',
-    'batch_norm',
-    'embedding',
-    'bilinear_tensor_product'
-    'conv2d'
-    'conv2d_transpose'
-    'conv3d'
-    'conv3d_transpose'
-    'create_parameter'
-    'crf_decoding'
-    'data_norm'
-    'deformable_conv'
-    'group_norm'
-    'hsigmoid'
-    'instance_norm'
-    'layer_norm'
-    'multi_box_head'
-    'nce'
-    'prelu'
-    'row_conv'
-    'spectral_norm',
-]
-
-from ..fluid.layers import fc, batch_norm, bilinear_tensor_product, \
-        conv2d, conv2d_transpose, conv3d, conv3d_transpose, create_parameter, \
-        crf_decoding, data_norm, deformable_conv, group_norm, hsigmoid, instance_norm, \
-        layer_norm, multi_box_head, nce, prelu, row_conv, spectral_norm
-
-from ..fluid.input import embedding
diff --git a/python/paddle/device.py b/python/paddle/device.py
index 894ee5b9e8b1debb2c043de30314e8ebb94d3bc0..e2ef8e7092ad3f6af91c8d5d3c0b1deaed025514 100644
--- a/python/paddle/device.py
+++ b/python/paddle/device.py
@@ -13,10 +13,119 @@
 # limitations under the License.
 
 # TODO: define the functions to manipulate devices 
-# __all__ = ['cpu_places',
-#            'CPUPlace',
-#            'cuda_pinned_places',
-#            'cuda_places',
-#            'CUDAPinnedPlace',
-#            'CUDAPlace',
-#            'is_compiled_with_cuda']
+from paddle.fluid import core
+from paddle.fluid import framework
+import re
+
+__all__ = [
+    'get_cudnn_version',
+    'set_device',
+    'get_device'
+    #            'cpu_places',
+    #            'CPUPlace',
+    #            'cuda_pinned_places',
+    #            'cuda_places',
+    #            'CUDAPinnedPlace',
+    #            'CUDAPlace',
+    #            'is_compiled_with_cuda'
+]
+
+_cudnn_version = None
+
+
+def get_cudnn_version():
+    """
+    This funciton return the version of cudnn. the retuen value is int which represents the 
+    cudnn version. For example, if it return 7600, it represents the version of cudnn is 7.6.
+    
+    Returns:
+        int: A int value which represents the cudnn version. If cudnn version is not installed, it return None.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+
+            cudnn_version = get_cudnn_version()
+
+
+
+    """
+    global _cudnn_version
+    if not core.is_compiled_with_cuda():
+        return None
+    if _cudnn_version is None:
+        cudnn_version = int(core.cudnn_version())
+        _cudnn_version = cudnn_version
+        if _cudnn_version < 0:
+            return None
+        else:
+            return cudnn_version
+    else:
+        return _cudnn_version
+
+
+def set_device(device):
+    """
+    Paddle supports running calculations on various types of devices, including CPU and GPU.
+    They are represented by string identifiers. This function can specify the global device
+    which the OP will run.
+
+    Parameters:
+        device(str): This parameter determines the specific running device.
+            It can be ``cpu`` or ``gpu:0``. When ``device`` is ``cpu``, the
+            program is running on the cpu. When ``device`` is ``gpu``, the
+            program is running ont the gpu.
+    Examples:
+
+     .. code-block:: python
+            
+        import paddle
+        paddle.enable_imperative()
+        paddle.fluid.dygraph.set_device("gpu:0")
+        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
+        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
+        data = paddle.stack([x1,x2], axis=1)
+    """
+    lower_device = device.lower()
+    if lower_device == 'cpu':
+        place = core.CPUPlace()
+        framework._set_expected_place(place)
+    else:
+        avaliable_device = ((lower_device == 'cpu') or
+                            re.match(r'gpu:\d+', lower_device))
+        if not avaliable_device:
+            raise ValueError(
+                "The device must be a string which is like 'cpu' or 'gpu:0'")
+        device_info_list = device.split(':', 1)
+        device_id = device_info_list[1]
+        device_id = int(device_id)
+        place = core.CUDAPlace(device_id)
+        framework._set_expected_place(place)
+
+
+def get_device():
+    """
+    This funciton can get the current global device of the program is running.
+    It's a string which is like 'cpu' and 'gpu:0'. if the global device is not
+    set, it will return a string which is 'gpu:0' when cuda is avaliable or it 
+    will return a string which is 'cpu' when cuda is not avaliable.
+
+    Examples:
+
+     .. code-block:: python
+            
+        import paddle
+        paddle.enable_imperative()
+        device = paddle.fluid.dygraph.get_device()
+
+    """
+    device = ''
+    place = framework._current_expected_place()
+    if isinstance(place, core.CPUPlace):
+        device = 'cpu'
+    elif isinstance(place, core.CUDAPlace):
+        device_id = place.get_device_id()
+        device = 'gpu:' + str(device_id)
+
+    return device
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index d0c32e26092f6ea25771279418582a24ea449ab2..34dd605f901b4357682dc514d59d110db74f9d5b 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .collective import *
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
new file mode 100644
index 0000000000000000000000000000000000000000..c40ae7179395a2fc16ece0d68546221ce53c2180
--- /dev/null
+++ b/python/paddle/distributed/collective.py
@@ -0,0 +1,447 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_
+from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ..fluid.layers.tensor import fill_constant
+from ..fluid.layers import utils
+from ..fluid.dygraph.parallel import prepare_context
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+__all__ = [
+    'broadcast',
+    'all_reduce',
+    'reduce',
+    'all_gather',
+    'scatter',
+    'barrier',
+    'ReduceOp',
+]
+
+
+class ReduceOp:
+    """Reduce Operation"""
+    SUM = 0
+    MAX = 1
+    MIN = 2
+    PROD = 3
+
+
+class _Group():
+    """The abstract representation of group."""
+
+    def __init__(self, rank, rank_num):
+        self.rank = rank
+        self.nranks = rank_num
+
+
+_default_group = _Group(
+    int(os.getenv("PADDLE_TRAINER_ID", "0")),
+    int(os.getenv("PADDLE_TRAINERS_NUM", "1")))
+
+
+def broadcast(tensor, src, group=0):
+    """
+
+    Broadcast a tensor from the source to all others.
+
+    Args:
+        tensor (Tensor): The Tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
+            should be float16, float32, float64, int32 or int64.
+        src (int): The source rank.
+        group (int): The process group to work on. It is Optional.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.broadcast(data, 1)
+        out = data.numpy()
+        # [[1, 2, 3], [1, 2, 3]]
+    """
+    if in_dygraph_mode():
+        return core.ops.c_broadcast(tensor, tensor, 'root', src,
+                                    'use_calc_stream', True, 'ring_id', group)
+
+    op_type = 'c_broadcast'
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'broadcast')
+    if not isinstance(src, int) or not isinstance(group, int):
+        raise ValueError("Both the type of 'src' and 'group' for broadcast "
+                         "should be int.")
+
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'root': src,
+            'use_calc_stream': True,
+            'ring_id': group,
+        })
+
+
+def all_reduce(tensor, op=ReduceOp.SUM, group=0):
+    """
+
+    Reduce a tensor over all ranks so that all get the result.
+
+    Args:
+        tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type
+            should be float16, float32, float64, int32 or int64.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
+        group (int): Optional. The process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        from paddle.distributed import ReduceOp
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.all_reduce(data)
+        out = data.numpy()
+        # [[5, 7, 9], [5, 7, 9]]
+    """
+    if in_dygraph_mode():
+        if op == ReduceOp.SUM:
+            return core.ops.c_allreduce_sum(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.MAX:
+            return core.ops.c_allreduce_max(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.MIN:
+            return core.ops.c_allreduce_min(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.PROD:
+            return core.ops.c_allreduce_prod(tensor, tensor, 'use_calc_stream',
+                                             True, 'ring_id', group)
+        else:
+            raise ValueError("Unknown parameter: {}.".format(op))
+
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'all_reduce')
+    if not op in [ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN, ReduceOp.PROD]:
+        raise ValueError("The op for all_reduce must be one of educeOp.PROD, "
+                         "ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN.")
+    if op == ReduceOp.SUM:
+        op_type = 'c_allreduce_sum'
+    elif op == ReduceOp.MAX:
+        op_type = 'c_allreduce_max'
+    elif op == ReduceOp.MIN:
+        op_type = 'c_allreduce_min'
+    elif op == ReduceOp.PROD:
+        op_type = 'c_allreduce_prod'
+    if not isinstance(group, int):
+        raise ValueError("The type of 'group' for all_reduce should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={'ring_id': group,
+               'use_calc_stream': True})
+
+
+def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
+    """
+
+    Reduce a tensor to the destination from all others.
+
+    Args:
+        tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
+            should be float16, float32, float64, int32 or int64.
+        dst (int): The destination rank id.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.reduce(data, 0)
+        out = data.numpy()
+        # [[5, 7, 9], [5, 7, 9]]
+    """
+    if in_dygraph_mode():
+        if op == ReduceOp.SUM:
+            return core.ops.c_reduce_sum(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.MAX:
+            return core.ops.c_reduce_max(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.MIN:
+            return core.ops.c_reduce_min(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.PROD:
+            return core.ops.c_reduce_prod(tensor, tensor, 'use_calc_stream',
+                                          True, 'ring_id', group, 'root_id',
+                                          dst)
+        else:
+            raise ValueError("Unknown parameter: {}.".format(op))
+
+    op_type = 'c_reduce'
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'all_reduce')
+    if not op in [ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN, ReduceOp.PROD]:
+        raise ValueError("The op for reduce must be one of educeOp.PROD, "
+                         "ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN.")
+
+    if op == ReduceOp.SUM:
+        op_type = 'c_reduce_sum'
+    elif op == ReduceOp.MAX:
+        op_type = 'c_reduce_max'
+    elif op == ReduceOp.MIN:
+        op_type = 'c_reduce_min'
+    elif op == ReduceOp.PROD:
+        op_type = 'c_reduce_prod'
+
+    if not isinstance(dst, int) or not isinstance(group, int):
+        raise ValueError("Both the type of 'dst' and 'group' for reduce "
+                         "should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': group,
+            'use_calc_stream': True,
+            'root_id': dst,
+        })
+
+
+def all_gather(tensor_list, tensor, group=0):
+    """
+
+    Gather tensors from all participators and all get the result.
+
+    Args:
+        tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type
+            should be float16, float32, float64, int32 or int64.
+        tensor (Tensor): The Tensor to send. Its data type
+            should be float16, float32, float64, int32 or int64.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        tensor_list = []
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
+            np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
+            data1 = paddle.to_tensor(np_data1)
+            data2 = paddle.to_tensor(np_data2)
+            paddle.distributed.all_gather(tensor_list, data1)
+        else:
+            np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
+            np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
+            data1 = paddle.to_tensor(np_data1)
+            data2 = paddle.to_tensor(np_data2)
+            out = paddle.distributed.all_gather(tensor_list, data2)
+    """
+    op_type = 'c_allgather'
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
+    if in_dygraph_mode():
+        core.ops.c_allgather(tensor, out, 'use_calc_stream', True, 'ring_id',
+                             group, 'nranks', _default_group.nranks)
+    else:
+        if not isinstance(tensor_list, list):
+            raise ValueError("The type of 'tensor_list' for all_gather "
+                             "should be list.")
+        for elem in tensor_list:
+            check_variable_and_dtype(
+                elem, 'tensor_list',
+                ['float16', 'float32', 'float64', 'int32', 'int64'],
+                'all_gather')
+        check_variable_and_dtype(
+            tensor, 'tensor',
+            ['float16', 'float32', 'float64', 'int32', 'int64'], 'all_gather')
+        if not isinstance(group, int):
+            raise ValueError("The type of 'group' for all_gather "
+                             "should be int.")
+        helper.append_op(
+            type=op_type,
+            inputs={'X': [tensor]},
+            outputs={'Out': [out]},
+            attrs={
+                'ring_id': group,
+                'use_calc_stream': True,
+                'nranks': _default_group.nranks
+            })
+
+    tensor_list.extend(paddle.split(out, _default_group.nranks, 0))
+
+
+def scatter(tensor, tensor_list=None, src=0, group=0):
+    """
+
+    Scatter a tensor to all participators.
+
+    Args:
+        tensor (Tensor): The output Tensor. Its data type
+            should be float16, float32, float64, int32 or int64.
+        tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type
+            should be float16, float32, float64, int32 or int64.
+        src (int): The source rank id.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data1 = np.array([7, 8, 9])
+            np_data2 = np.array([10, 11, 12])
+        else:
+            np_data1 = np.array([1, 2, 3])
+            np_data2 = np.array([4, 5, 6])
+        data1 = paddle.to_tensor(np_data1)
+        data2 = paddle.to_tensor(np_data2)
+        if paddle.ParallelEnv().local_rank == 0:
+            paddle.distributed.scatter(data1, src=1)
+        else:
+            paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
+        out = data1.numpy()
+    """
+    op_type = 'c_scatter'
+    global _default_group
+    rank = _default_group.rank
+    nranks = _default_group.nranks
+    if rank != src:
+        tensor_list = []
+        for _ in range(nranks):
+            tensor_list.append(tensor)
+    temp = paddle.concat(tensor_list, axis=0)
+    if in_dygraph_mode():
+        return core.ops.c_scatter(temp, tensor, 'use_calc_stream', True,
+                                  'ring_id', group, 'nranks',
+                                  _default_group.nranks, 'root', src)
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'scatter')
+    if not isinstance(group, int) or not isinstance(src, int):
+        raise ValueError("Both the type of 'src' and 'group' for scatter "
+                         "should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': group,
+            'root': src,
+            'use_calc_stream': True,
+            'nranks': nranks,
+        })
+
+
+def barrier(group=0):
+    """
+
+    Barrier among all participators in the group.
+
+    Args:
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        paddle.distributed.barrier()
+    """
+    op_type = 'barrier'
+    temp = paddle.fill_constant([1], dtype="int32", value="1")
+    if in_dygraph_mode():
+        return core.ops.barrier(temp, temp, 'ring_id', group)
+    if not isinstance(group, int):
+        raise ValueError("The type of 'group' for barrier must be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [temp]},
+        attrs={'ring_id': group})
diff --git a/python/paddle/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
similarity index 77%
rename from python/paddle/fleet/__init__.py
rename to python/paddle/distributed/fleet/__init__.py
index cc5ce0f2b74b6193652b7ec7ed4e03407df296c5..b080fb17553d4a93a545f4ae781d786d82e26576 100644
--- a/python/paddle/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -12,16 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define distributed api under this directory, 
+# TODO: define distributed api under this directory,
+from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker
 from .base.distributed_strategy import DistributedStrategy
 from .base.fleet_base import Fleet
 from .base.util_factory import UtilBase
 from .dataset import *
-#from .base.role_maker import PaddleCloudRoleMaker
 
 __all__ = [
-    "DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
-    "InMemoryDataset", "QueueDataset"
+    "DistributedStrategy",
+    "UtilBase",
+    "DatasetFactory",
+    "DatasetBase",
+    "InMemoryDataset",
+    "QueueDataset",
+    "UserDefinedRoleMaker",
+    "PaddleCloudRoleMaker",
 ]
 
 fleet = Fleet()
@@ -42,4 +48,6 @@ init_server = fleet.init_server
 run_server = fleet.run_server
 stop_worker = fleet.stop_worker
 distributed_optimizer = fleet.distributed_optimizer
+save_inference_model = fleet.save_inference_model
+save_persistables = fleet.save_persistables
 minimize = fleet.minimize
diff --git a/python/paddle/fleet/base/__init__.py b/python/paddle/distributed/fleet/base/__init__.py
similarity index 100%
rename from python/paddle/fleet/base/__init__.py
rename to python/paddle/distributed/fleet/base/__init__.py
diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
old mode 100644
new mode 100755
similarity index 61%
rename from python/paddle/fleet/base/distributed_strategy.py
rename to python/paddle/distributed/fleet/base/distributed_strategy.py
index 43e50ca0bee6b324655f7dcfb5e5da2ebc0e85a8..5e527ea03ab9c816948f343ac103672a751fdbc3
--- a/python/paddle/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import paddle
-from paddle.fleet.proto import distributed_strategy_pb2
-from paddle.fluid.framework import Variable
+from paddle.distributed.fleet.proto import distributed_strategy_pb2
+from paddle.fluid.framework import Variable, set_flags, core
 import google.protobuf.text_format
 
 
@@ -81,6 +81,8 @@ class DistributedJobInfo(object):
 
 
 class DistributedStrategy(object):
+    __lock_attr = False
+
     def __init__(self):
         """
         DistributedStrategy is the main configuration entry for distributed training of Paddle.
@@ -95,6 +97,13 @@ class DistributedStrategy(object):
 
         """
         self.strategy = distributed_strategy_pb2.DistributedStrategy()
+        self.__lock_attr = True
+
+    def __setattr__(self, key, value):
+        if self.__lock_attr and not hasattr(self, key):
+            raise TypeError("%s is not a attribute of %s" %
+                            (key, self.__class__.__name__))
+        object.__setattr__(self, key, value)
 
     def save_to_prototxt(self, output):
         """
@@ -103,7 +112,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.dgc = True
             strategy.recompute = True
@@ -120,7 +129,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.load_from_prototxt("dist_strategy.protoxt")
         """
@@ -141,7 +150,7 @@ class DistributedStrategy(object):
             exe_strategy.num_iteration_per_drop_scope = 10
             exe_strategy.num_iteration_per_run = 10
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.execution_strategy = exe_strategy
         """
         execution_strategy = paddle.fluid.ExecutionStrategy()
@@ -178,7 +187,7 @@ class DistributedStrategy(object):
             build_strategy.fuse_all_optimizer_ops = True
             build_strategy.enable_inplace = True
             
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.build_strategy = build_strategy
         """
 
@@ -211,7 +220,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             role_maker = fleet.PaddleCloudRoleMaker()
             fleet.init(role_maker)
 
@@ -227,8 +236,11 @@ class DistributedStrategy(object):
     def a_sync(self, flag):
         if isinstance(flag, bool):
             self.strategy.a_sync = flag
+            self.a_sync_configs = {"k_steps": 0}
         else:
-            print("WARNING: a_sync should have value of bool type")
+            raise ValueError(
+                "The type of `flag` is invalid, expected type is bool, but received %s".
+                format(type(flag)))
 
     @property
     def a_sync_configs(self):
@@ -250,7 +262,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             role_maker = fleet.PaddleCloudRoleMaker()
             fleet.init(role_maker)
 
@@ -279,7 +291,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.amp = True # by default this is false
 
@@ -295,6 +307,30 @@ class DistributedStrategy(object):
 
     @property
     def amp_configs(self):
+        """
+        Set automatic mixed precision training configurations. In general, amp has serveral configurable
+        settings that can be configured through a dict.
+
+        **Notes**:
+            **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768.
+            **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True.
+            **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
+            **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
+            **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0.
+            **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
+            **custom_white_list(list[str])**: Users' custom white list which always execution fp16.
+            **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "custom_white_list": ['conv2d']}
+        """
         return get_msg_dict(self.strategy.amp_configs)
 
     @amp_configs.setter
@@ -311,7 +347,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.recompute = True
             # suppose x and y are names of checkpoint tensors for recomputation
@@ -321,6 +357,17 @@ class DistributedStrategy(object):
 
     @property
     def sync_nccl_allreduce(self):
+        """
+        Indicating whether we are using synchronized all reduce in each communication thread
+        We note that system overhead is usually lower when sync_nccl_allreduce = True
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.sync_nccl_allreduce = True
+        """
         return self.strategy.sync_nccl_allreduce
 
     @sync_nccl_allreduce.setter
@@ -332,6 +379,18 @@ class DistributedStrategy(object):
 
     @property
     def use_hierarchical_allreduce(self):
+        """
+        Indicating whether we are using hierarchical allreduce in collective communication
+        Hierarchical allreduce often does allreduce within a certain node group and then do
+        allreduce among the leaders of each group
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.use_hierarchical_allreduce = True
+        """
         return self.strategy.use_hierarchical_allreduce
 
     @use_hierarchical_allreduce.setter
@@ -345,6 +404,17 @@ class DistributedStrategy(object):
 
     @property
     def hierarchical_allreduce_inter_nranks(self):
+        """
+        Number of ranks for low level node groups in hierarchical allreduce
+        Default value: number of GPU cards on each single GPU machine
+
+        Example:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.hierarchical_allreduce_inter_nranks = 8
+        """
         return self.strategy.hierarchical_allreduce_inter_nranks
 
     @hierarchical_allreduce_inter_nranks.setter
@@ -358,6 +428,19 @@ class DistributedStrategy(object):
 
     @property
     def sync_batch_norm(self):
+        """
+        Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
+        
+        Default value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.sync_batch_norm = True
+        """
+
         return self.strategy.sync_batch_norm
 
     @sync_batch_norm.setter
@@ -369,6 +452,17 @@ class DistributedStrategy(object):
 
     @property
     def fuse_all_reduce_ops(self):
+        """
+        Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
+        Default value: True
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fuse_all_reduce_ops = False
+        """
         return self.strategy.fuse_all_reduce_ops
 
     @fuse_all_reduce_ops.setter
@@ -380,6 +474,18 @@ class DistributedStrategy(object):
 
     @property
     def fuse_grad_size_in_MB(self):
+        """
+        Specifying the size of gradient to fuse in Mega-Bytes
+
+        Default value: 32
+
+        Examples:
+          .. code-block:: python
+        
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fuse_grad_size_in_MB = 50
+        """
         return self.strategy.fuse_grad_size_in_MB
 
     @fuse_grad_size_in_MB.setter
@@ -404,6 +510,19 @@ class DistributedStrategy(object):
 
     @property
     def nccl_comm_num(self):
+        """
+        Specifying the number of NCCL communicator
+
+        Default value: 1
+
+        Examples:
+          .. code-block:: python
+        
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.nccl_comm_num = 2
+        """
+
         return self.strategy.nccl_comm_num
 
     @nccl_comm_num.setter
@@ -429,7 +548,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.recompute = True
             strategy.recompute_configs = {"checkpionts": ["x", "y"]}
@@ -454,7 +573,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
 
@@ -487,7 +606,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
             strategy.pipeline_configs = {"micro_batch": 12}
@@ -504,6 +623,20 @@ class DistributedStrategy(object):
 
     @property
     def localsgd(self):
+        """
+        Indicating whether we are using Local SGD training. For more details, please refer to
+        [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf),
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.localsgd = True # by default this is false
+
+        """
         return self.strategy.localsgd
 
     @localsgd.setter
@@ -515,6 +648,28 @@ class DistributedStrategy(object):
 
     @property
     def localsgd_configs(self):
+        """
+        Set LocalSGD training configurations. LocalSGD has a configurable
+        setting that can be configured through a dict.
+
+        **Notes**:
+            **k_steps(int)**: The local steps for training before parameter
+                synchronization. Default 1. If strategy.auto is set True, the
+                local steps will be calculated automatically during training.
+                The algorithm is referenced in this paper: 
+                [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf).
+                In this case, k_steps indicates the first local steps which
+                is suggested setting to 1.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.localsgd = True
+            strategy.localsgd_configs = {"k_steps": 4}
+        """
+
         return get_msg_dict(self.strategy.localsgd_configs)
 
     @localsgd_configs.setter
@@ -525,6 +680,20 @@ class DistributedStrategy(object):
 
     @property
     def dgc(self):
+        """
+        Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
+        [Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.dgc = True # by default this is false
+
+        """
         return self.strategy.dgc
 
     @dgc.setter
@@ -536,6 +705,28 @@ class DistributedStrategy(object):
 
     @property
     def dgc_configs(self):
+        """
+        Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable
+        settings that can be configured through a dict.
+
+        **Notes**:
+            **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0.
+            **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1.
+                For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100,
+                it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array
+                ends, it will use 0.999 then and after.
+            **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity).
+                Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important
+                element will be transmitted.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.dgc = True
+            strategy.dgc_configs = {"rampup_begin_step": 1252}
+        """
         return get_msg_dict(self.strategy.dgc_configs)
 
     @dgc_configs.setter
@@ -557,7 +748,7 @@ class DistributedStrategy(object):
 
         Examples:
         .. code-block:: python
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.gradient_merge = True
             strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
@@ -580,7 +771,7 @@ class DistributedStrategy(object):
             avg (bool): whether to average the gradients of each mini-batch,
                 the default value is `True`
         Example:
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.gradient_merge = True
             strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
@@ -595,6 +786,20 @@ class DistributedStrategy(object):
 
     @property
     def lars(self):
+        """
+        Set lars configurations. lars is used to deal with the convergence problems when the global 
+        batch size is larger than 8k.  For more details, please refer to 
+        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True # by default this is false
+        """
         return self.strategy.lars
 
     @lars.setter
@@ -606,6 +811,29 @@ class DistributedStrategy(object):
 
     @property
     def lars_configs(self):
+        """
+        Set Lars training configurations.
+
+        **Notes**:
+        **lars_coeff (float)**: trust ratio in lars formula.
+        **lars_weight_decay** (float): weight decay coefficient in lars formula.
+        **epsilon (float)**: argument is used to avoid potential devision-by-zero 
+        when compute the local lr; 
+        **exclude_from_weight_decay ([string])**: is a list of name strings of layers which
+        will be exclude from weight decay in lars formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True
+            strategy.lars_configs = {
+                        "lars_coeff": 0.01,
+                        "lars_weight_decay": 0.0005,
+                        "epsilon": 0,
+                        "exclude_from_weight_decay": ['batch_norm', '.b_0']
+                    }
+        """
         return get_msg_dict(self.strategy.lars_configs)
 
     @lars_configs.setter
@@ -615,6 +843,22 @@ class DistributedStrategy(object):
 
     @property
     def lamb(self):
+        """
+        Set lamb configurations. lamb is used to deal with the convergence problems for large 
+        batch size training, specially for attention-related model like BERT. For more details, 
+        please refer to 
+        [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
+
+        Default Value: False
+        
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True # by default this is false
+        """
+
         return self.strategy.lamb
 
     @lamb.setter
@@ -624,6 +868,33 @@ class DistributedStrategy(object):
         else:
             print("WARNING: lamb should have value of bool type")
 
+    @property
+    def lamb_configs(self):
+        """
+        Set Lars training configurations.
+
+        **Notes**:
+        **lamb_weight_decay** (float): weight decay coefficient in lamb formula.
+        **exclude_from_weight_decay ([string])**: is a list of name strings of layers which
+        will be exclude from weight decay in lamb formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                    'lamb_weight_decay': 0.01,
+                    'exclude_from_weight_decay': [],
+                }
+        """
+        return get_msg_dict(self.strategy.lamb_configs)
+
+    @lamb_configs.setter
+    def lamb_configs(self, configs):
+        check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs")
+        assign_configs_value(self.strategy.lamb_configs, configs)
+
     @property
     def elastic(self):
         return self.strategy.elastic
@@ -646,6 +917,68 @@ class DistributedStrategy(object):
         else:
             print("WARNING: auto should have value of bool type")
 
+    @property
+    def cudnn_exhaustive_search(self):
+        return self.strategy.cudnn_exhaustive_search
+
+    @cudnn_exhaustive_search.setter
+    def cudnn_exhaustive_search(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.cudnn_exhaustive_search = flag
+        else:
+            print(
+                "WARNING: cudnn_exhaustive_search should have value of bool type"
+            )
+
+    @property
+    def conv_workspace_size_limit(self):
+        return self.strategy.conv_workspace_size_limit
+
+    @conv_workspace_size_limit.setter
+    def conv_workspace_size_limit(self, value):
+        if isinstance(value, int):
+            self.strategy.conv_workspace_size_limit = value
+        else:
+            print(
+                "WARNING: conv_workspace_size_limit should have value of int type"
+            )
+
+    @property
+    def cudnn_batchnorm_spatial_persistent(self):
+        return self.strategy.cudnn_batchnorm_spatial_persistent
+
+    @cudnn_batchnorm_spatial_persistent.setter
+    def cudnn_batchnorm_spatial_persistent(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.cudnn_batchnorm_spatial_persistent = flag
+        else:
+            print(
+                "WARNING: cudnn_batchnorm_spatial_persistent should have value of bool type"
+            )
+
+    def _enable_env(self):
+        strategy = self.strategy
+        keys = [
+            "FLAGS_cudnn_batchnorm_spatial_persistent",
+            "FLAGS_conv_workspace_size_limit",
+            "FLAGS_cudnn_exhaustive_search",
+            "FLAGS_sync_nccl_allreduce",
+            "FLAGS_fuse_parameter_memory_size",
+            "FLAGS_fuse_parameter_groups_size",
+        ]
+        values = [
+            bool(strategy.cudnn_batchnorm_spatial_persistent),
+            int(strategy.conv_workspace_size_limit),
+            bool(strategy.cudnn_exhaustive_search),
+            bool(strategy.sync_nccl_allreduce),
+            int(strategy.fuse_grad_size_in_MB),
+            int(strategy.fuse_grad_size_in_TFLOPS),
+        ]
+
+        for i, key in enumerate(keys):
+            if core.globals().is_public(key):
+                core.globals()[key] = values[i]
+
     def __repr__(self):
         fields = self.strategy.DESCRIPTOR.fields
         for f in fields:
diff --git a/python/paddle/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
similarity index 77%
rename from python/paddle/fleet/base/fleet_base.py
rename to python/paddle/distributed/fleet/base/fleet_base.py
index 979b878a3df966a3af59cee126b884361f5b6ac7..a6286bcca87fad1afddbd8af1e56dad05dab2578 100644
--- a/python/paddle/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -14,14 +14,32 @@
 
 from __future__ import print_function
 import paddle
+from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase
 from .strategy_compiler import StrategyCompiler
+from .distributed_strategy import DistributedStrategy
 from .meta_optimizer_factory import MetaOptimizerFactory
 from .runtime_factory import RuntimeFactory
 from .util_factory import UtilFactory
+from paddle.fluid.wrapped_decorator import wrap_decorator
 
 __all__ = ['Fleet']
 
 
+def _inited_runtime_handler_(func):
+    def __impl__(*args, **kwargs):
+        cls = args[0]
+
+        if cls._runtime_handle is None:
+            raise ValueError("Fleet can not find suitable runtime handler")
+
+        return func(*args, **kwargs)
+
+    return __impl__
+
+
+inited_runtime_handler = wrap_decorator(_inited_runtime_handler_)
+
+
 class Fleet(object):
     """
     Unified API for distributed training of PaddlePaddle
@@ -34,9 +52,8 @@ class Fleet(object):
     Examples:
         .. code-block:: python
 
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            import paddle.distributed.fleet as fleet
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
@@ -58,10 +75,35 @@ class Fleet(object):
     def __init__(self):
         self._runtime_handle = None
         self._util = None
+        self._role_maker = None
+        self._is_collective = False
+
+    def init(self, role_maker=None, is_collective=False):
+        """
+        Initialize role_maker in Fleet.
 
-    def init(self, role_maker):
-        self._role_maker = role_maker
+        This function is responsible for the distributed architecture 
+        what you want to run your code behind,such as Transpiler,
+        Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker 
+        
+        """
+        if isinstance(role_maker, RoleMakerBase):
+            self._role_maker = role_maker
+        elif role_maker == None:
+            if isinstance(is_collective, bool):
+                self._is_collective = is_collective
+                self._role_maker = PaddleCloudRoleMaker(
+                    is_collective=self._is_collective)
+            else:
+                raise ValueError(
+                    "Something wrong occurred, please check whether is_collective is bool value"
+                )
+        else:
+            raise ValueError(
+                "Something wrong occurred, please check whether rolemaker is instance of RoleMakerBase"
+            )
         self.strategy_compiler = StrategyCompiler()
+        return None
 
     def is_first_worker(self):
         """
@@ -182,35 +224,49 @@ class Fleet(object):
         """
         self._role_maker.barrier_worker()
 
+    @inited_runtime_handler
     def init_worker(self):
         """
         init worker
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._init_worker()
 
-    def init_server(self, model_dir=None):
+    @inited_runtime_handler
+    def init_server(self, *args, **kwargs):
         """
         init server
         """
-        assert self._runtime_handle is not None
-        self._runtime_handle._init_server()
+        self._runtime_handle._init_server(*args, **kwargs)
 
+    @inited_runtime_handler
     def run_server(self):
         """
         run server
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._run_server()
 
+    @inited_runtime_handler
     def stop_worker(self):
         """
         stop worker
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._stop_worker()
 
-    def distributed_optimizer(self, optimizer, strategy):
+    def save_inference_model(self,
+                             executor,
+                             dirname,
+                             feeded_var_names,
+                             target_vars,
+                             main_program=None,
+                             export_for_deployment=True):
+        self._runtime_handle._save_inference_model(
+            executor, dirname, feeded_var_names, target_vars, main_program,
+            export_for_deployment)
+
+    def save_persistables(self, executor, dirname, main_program=None):
+        self._runtime_handle._save_persistables(executor, dirname, main_program)
+
+    def distributed_optimizer(self, optimizer, strategy=None):
         """
         distirbuted_optimizer
         Returns:
@@ -218,15 +274,16 @@ class Fleet(object):
 
         Examples:
             .. code-block:: python
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            import paddle.distributed.fleet as fleet
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
             optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         """
         self.user_defined_optimizer = optimizer
+        if strategy == None:
+            strategy = DistributedStrategy()
         self.user_defined_strategy = strategy
         self.valid_strategy = None
         return self
@@ -260,8 +317,7 @@ class Fleet(object):
 
         Examples:
             import paddle
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+            import paddle.distributed.fleet as fleet
 
             fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh')
             fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh')
@@ -269,7 +325,7 @@ class Fleet(object):
             cost = paddle.layers.cross_entropy(input=prediction, label=input_y)
             avg_cost = paddle.layers.mean(x=cost)
 
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
@@ -286,11 +342,12 @@ class Fleet(object):
         context["loss"] = loss
         if startup_program == None:
             self.origin_startup_program = \
-                paddle.default_startup_program().clone(for_test=False)
-            startup_program = paddle.default_startup_program()
+                paddle.static.default_startup_program().clone(for_test=False)
+            startup_program = paddle.static.default_startup_program()
         else:
             self.origin_startup_program = \
                 startup_program.clone(for_test=False)
+
         context["origin_startup_program"] = startup_program
         context["role_maker"] = self._role_maker
 
@@ -326,15 +383,23 @@ class Fleet(object):
         context["valid_strategy"] = valid_strategy
 
         self.valid_strategy = valid_strategy
+        self.valid_strategy._enable_env()
 
         optimize_ops = []
         params_grads = []
+
         if meta_optimizer:
             optimize_ops, params_grads = meta_optimizer.minimize(
                 loss,
                 startup_program=startup_program,
                 parameter_list=parameter_list,
                 no_grad_set=no_grad_set)
+
+            default_program = paddle.static.default_main_program()
+
+            if id(default_program) != id(loss.block.program):
+                paddle.fluid.framework.switch_main_program(loss.block.program)
+
         else:
             optimize_ops, params_grads = self.user_defined_optimizer.minimize(
                 loss,
@@ -344,6 +409,7 @@ class Fleet(object):
 
         context["program_optimize_ops"] = optimize_ops
         context["program_params_grads"] = params_grads
+
         if graph_optimizer:
             optimize_ops, params_grads = graph_optimizer.minimize(
                 loss,
diff --git a/python/paddle/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
similarity index 61%
rename from python/paddle/fleet/base/meta_optimizer_factory.py
rename to python/paddle/distributed/fleet/base/meta_optimizer_factory.py
index 802f6c4dab7f3a98cc11d9bb1956db5ee33b2746..459070fcc4dbef3711c33b2932e8f1c88647aab5 100755
--- a/python/paddle/fleet/base/meta_optimizer_factory.py
+++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
@@ -12,27 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..meta_optimizers import AMPOptimizer
-from ..meta_optimizers import RecomputeOptimizer
-from ..meta_optimizers import GradientMergeOptimizer
-from ..meta_optimizers import GraphExecutionOptimizer
-from ..meta_optimizers import PipelineOptimizer
-from ..meta_optimizers import LocalSGDOptimizer
-from ..meta_optimizers import LarsOptimizer
-from ..meta_optimizers import DGCOptimizer
-
 __all__ = ["MetaOptimizerFactory"]
 
-meta_optimizer_names = [
-    "AMPOptimizer",
-    "RecomputeOptimizer",
-    "GradientMergeOptimizer",
-    "GraphExecutionOptimizer",
-    "PipelineOptimizer",
-    "LocalSGDOptimizer",
-    "LarsOptimizer",
-    "DGCOptimizer",
-]
+from ..meta_optimizers import *
+
+meta_optimizer_names = list(
+    filter(lambda name: name.endswith("Optimizer"), dir()))
 
 
 class MetaOptimizerFactory(object):
diff --git a/python/paddle/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py
similarity index 100%
rename from python/paddle/fleet/base/private_helper_function.py
rename to python/paddle/distributed/fleet/base/private_helper_function.py
diff --git a/python/paddle/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
similarity index 90%
rename from python/paddle/fleet/base/role_maker.py
rename to python/paddle/distributed/fleet/base/role_maker.py
index b3e8120af6f855bb6dba157af107f4ca7ca3b3a4..6aeeb4a2896ea1d20390e463937aa07d3edd0204 100644
--- a/python/paddle/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -110,6 +110,14 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
+    def node_num(self):
+        """
+        Get the training node number
+        Returns:
+            int: node num
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
     def get_trainer_endpoints(self):
         """
         return trainer endpoints
@@ -157,10 +165,10 @@ class RoleMakerBase(object):
 
 
 class PaddleCloudRoleMaker(RoleMakerBase):
-    def __init__(self, is_collective=False, init_gloo=True, **kwargs):
+    def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
-        self._init_gloo = init_gloo
+        self._init_gloo = False  #default no init gloo
         self._kwargs = kwargs
 
         self._role_is_generated = False
@@ -196,30 +204,35 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self._prefix = os.getenv("SYS_JOB_ID", "")
 
     def _barrier(self, comm_world):
-        if comm_world:
+        if isinstance(comm_world, fluid.core.Gloo):
             comm_world.barrier()
+        else:
+            print("warning: must init Gloo before using _barrier() function")
 
     def _all_gather(self, comm_world, input):
-        if comm_world:
+        if isinstance(comm_world, fluid.core.Gloo):
             self._barrier(comm_world)
             output = comm_world.all_gather(input)
             return output
         else:
+            print("warning: must init Gloo before using _all_gather() function")
             return None
 
     def _all_reduce(self, comm_world, input, mode="sum"):
-        if not comm_world:
-            return None
+        if isinstance(comm_world, fluid.core.Gloo):
 
-        input = np.array(input)
+            input = np.array(input)
 
-        input_shape = input.shape
-        input_list = input.reshape(-1).tolist()
+            input_shape = input.shape
+            input_list = input.reshape(-1).tolist()
 
-        self._barrier(comm_world)
-        ans = comm_world.all_reduce(input_list, mode)
-        output = np.array(ans).reshape(input_shape)
-        return output
+            self._barrier(comm_world)
+            ans = comm_world.all_reduce(input_list, mode)
+            output = np.array(ans).reshape(input_shape)
+            return output
+        else:
+            print("warning: must init Gloo before using _all_reduce() function")
+            return None
 
     def is_worker(self):
         """
@@ -286,6 +299,14 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self.generate_role()
         return self._trainers_num
 
+    def node_num(self):
+        """
+        return the training node number
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._node_num
+
     def get_trainer_endpoints(self):
         """
         get endpoint of all trainers
@@ -353,6 +374,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._trainers_num = trainers_num
         self._role = role
         self._current_id = current_id
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _collective_env(self):
         self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
@@ -363,6 +386,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
         self._worker_endpoints = self._worker_endpoints.split(",")
         self._trainers_num = len(self._worker_endpoints)
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _init_gloo_env(self):
         def init_gloo_instance(role="trainer"):
@@ -440,6 +465,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         if not self._role_is_generated:
             if not self._is_collective:
                 self._ps_env()
+                if "PADDLE_WITH_GLOO" in os.environ:
+                    self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"])
                 if self._init_gloo:
                     self._init_gloo_env()
             else:
@@ -481,7 +508,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         return "lo"
 
     def __start_kv_server(self, http_server_d, size_d):
-        from paddle.fleet.utils import KVServer
+        from paddle.distributed.fleet.utils import KVServer
         http_server = KVServer(int(self._http_ip_port[1]), size_d)
         http_server.start()
         wait_seconds = 5
@@ -513,12 +540,16 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
             self._cur_endpoint = self._worker_endpoints[self._current_id]
         elif self._role == Role.SERVER:
             self._cur_endpoint = self._server_endpoints[self._current_id]
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _user_defined_collective_env(self):
         self._worker_endpoints = self._kwargs.get("worker_endpoints")
         self._current_id = self._kwargs.get("current_id")
         self._trainers_num = len(self._worker_endpoints)
         self._training_role = Role.Worker
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def generate_role(self):
         """
diff --git a/python/paddle/fleet/base/runtime_factory.py b/python/paddle/distributed/fleet/base/runtime_factory.py
similarity index 74%
rename from python/paddle/fleet/base/runtime_factory.py
rename to python/paddle/distributed/fleet/base/runtime_factory.py
index 45dca6dae4e065ba6f2a9f09ac8cf298222b2d15..68d327c2280d01507db8798a80ed19c4eb3a0f4c 100644
--- a/python/paddle/fleet/base/runtime_factory.py
+++ b/python/paddle/distributed/fleet/base/runtime_factory.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ..runtime.collective_runtime import CollectiveRuntime
+from ..runtime.parameter_server_runtime import ParameterServerRuntime
 
 
 class RuntimeFactory(object):
@@ -23,3 +24,9 @@ class RuntimeFactory(object):
             collective_runtime = CollectiveRuntime()
             collective_runtime._set_basic_info(context)
             return collective_runtime
+
+        k_steps = context["valid_strategy"].a_sync_configs["k_steps"]
+        if not context["role_maker"]._is_collective and k_steps >= 0:
+            ps_runtime = ParameterServerRuntime()
+            ps_runtime._set_basic_info(context)
+            return ps_runtime
diff --git a/python/paddle/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
similarity index 75%
rename from python/paddle/fleet/base/strategy_compiler.py
rename to python/paddle/distributed/fleet/base/strategy_compiler.py
index f0e23713e4f3f98217280f2cbe071bf1e23c823e..4097fc1237f8d7616101810f994c243dffb2cd67 100644
--- a/python/paddle/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -76,6 +76,18 @@ class StrategyCompiler(StrategyCompilerBase):
             opt._disable_strategy(valid_strategy)
         return valid_strategy
 
+    """
+    Meta Optimizer Type A: rewrite forward, backward. e.g. recompute, async, sync, pipeline.
+                           results will be splitted in async, sync, pipeline
+    Meta Optimizer Type B: rewrite forward, 
+                           e.g. AMP and the corresponding backward is generated by rewritten forward
+    Meta Opitmizer Type B: rewrite backward. e.g. gradient fusion
+    Meta Optimizer Type D: rewrite optimize. e.g. lars, lamb, localsgd, gradient merge, dgc
+    Meta Optimizer Type E: only transpile to Graph structure for runtime,
+                           currently, grad fusion and kernel fusion, sync batch-norm included.
+                           we will remove grad fusion and sync batch-norm
+    """
+
     def generate_optimizer(self, loss, role_maker, optimizer,
                            user_defined_strategy, meta_optimizer_list,
                            graph_optimizer_list):
@@ -102,4 +114,18 @@ class StrategyCompiler(StrategyCompilerBase):
                 0]
             return_graph = None if graph_optimizers == None else graph_optimizers[
                 0]
+
+            if meta_optimizers == None or graph_optimizers == None:
+                return return_meta, return_graph
+
+            # do heuristic filter here, if any meta optimizer in graph optimizers is in 
+            # any meta optimizers' black list, set return_graph to None
+            need_graph_opt = True
+            for graph_opt in graph_optimizers:
+                for program_opt in meta_optimizers:
+                    if graph_opt.__class__.__name__ in program_opt.meta_optimizers_black_list:
+                        need_graph_opt = False
+            if not need_graph_opt:
+                return_graph = None
+
             return return_meta, return_graph
diff --git a/python/paddle/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
similarity index 99%
rename from python/paddle/fleet/base/util_factory.py
rename to python/paddle/distributed/fleet/base/util_factory.py
index ed2a8db586aa9c33a3aeed51b77af98e11b4dc5f..f5a6c417c0c45bea819c5832f98b5b6c9fabbd4b 100644
--- a/python/paddle/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -55,8 +55,8 @@ class UtilBase(object):
 
     def set_file_system(self, fs_client):
         assert isinstance(
-            fs_client,
-            FS), "fs_client must be the instance of paddle.fleet.utils.FS"
+            fs_client, FS
+        ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
         self.fs_client = fs_client
 
     def __check_comm_world(self, comm_world="worker"):
diff --git a/python/paddle/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py
similarity index 97%
rename from python/paddle/fleet/cloud_utils.py
rename to python/paddle/distributed/fleet/cloud_utils.py
index 72c306fe3b91531b6f7f39134bf4abd86c686dee..49d66118d902e43f7ee0c4003c516081092b2a97 100644
--- a/python/paddle/fleet/cloud_utils.py
+++ b/python/paddle/distributed/fleet/cloud_utils.py
@@ -14,7 +14,7 @@
 
 import os
 import paddle
-from paddle.fleet.launch_utils import get_cluster, logger
+from paddle.distributed.fleet.launch_utils import get_cluster, logger
 
 
 def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
diff --git a/python/paddle/fleet/dataset/__init__.py b/python/paddle/distributed/fleet/dataset/__init__.py
similarity index 100%
rename from python/paddle/fleet/dataset/__init__.py
rename to python/paddle/distributed/fleet/dataset/__init__.py
diff --git a/python/paddle/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py
similarity index 100%
rename from python/paddle/fleet/dataset/dataset.py
rename to python/paddle/distributed/fleet/dataset/dataset.py
diff --git a/python/paddle/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
similarity index 60%
rename from python/paddle/fleet/launch.py
rename to python/paddle/distributed/fleet/launch.py
index de5e0b66b3e41818875f84e4ba5dd0557bfdb02f..29a1bda92f17443e6c38b070379481aaa419b1d4 100644
--- a/python/paddle/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-paddle.distributed.launch is a module that spawns multiple distributed 
+fleetrun is a module that spawns multiple distributed
 process on each training node for gpu training and cpu training.
 Usage:
-    In both of single node training or multiple node training, this module 
+    In both of single node training or multiple node training, this module
 launch a process on each of the given gpu card or cpu machine.
     GPU training:
     1. for single node training with all visible gpu cards:
@@ -24,24 +24,33 @@ launch a process on each of the given gpu card or cpu machine.
        fleetrun --gpus="0,1,2,3" your_training_py (arg1 arg2 and all others)
     3. for multiple node training such as two node:192.168.0.16, 192.168.0.17
         on 192.168.0.16:
-            fleetrun --ips="192.168.0.16,192.168.0.17" --node_ip=192.168.0.16 \
+            fleetrun --ips="192.168.0.16,192.168.0.17" \
                 your_training_py (arg1 arg2 and all others)
         on 192.168.0.17:
             fleetrun --ips="192.168.0.16,192.168.0.17" \
-                --node_ip=192.168.0.17 \
                 your_training_py (arg1 arg2 and all others)
     CPU training:
     1. for single node training with multi servers and workers:
-        fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
+        fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others)
     2. for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
-        with 2 servers and  4 workers.
+        with 2 servers and 4 workers.
         on 192.168.0.16:
-            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
                 your_training_py (arg1 arg2 and all others)
         on 192.168.0.17:
             fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
+                your_training_py (arg1 arg2 and all others)
+    3. use gloo backend for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
+        with 2 servers and 4 workers. (workers should set port)
+        on 192.168.0.16:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
+                your_training_py (arg1 arg2 and all others)
+        on 192.168.0.17:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
                 your_training_py (arg1 arg2 and all others)
 """
 
@@ -57,8 +66,8 @@ from argparse import ArgumentParser, REMAINDER
 import paddle
 import paddle.fluid as fluid
 
-from paddle.fleet.launch_utils import *
-import paddle.fleet.cloud_utils as cloud_utils
+from paddle.distributed.fleet.launch_utils import *
+import paddle.distributed.fleet.cloud_utils as cloud_utils
 
 
 def _print_arguments(args):
@@ -96,15 +105,14 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         "--servers", type=str, default="", help="User defined servers ip:port")
     parser.add_argument(
         "--workers", type=str, default="", help="User defined workers ip:port")
-    parser.add_argument(
-        "--worker_num", type=int, default=2, help="number of workers")
+    parser.add_argument("--worker_num", type=int, help="number of workers")
 
-    parser.add_argument(
-        "--server_num", type=int, default=2, help="number of servers")
+    parser.add_argument("--server_num", type=int, help="number of servers")
 
     parser.add_argument(
         "--log_dir",
         type=str,
+        default="log",
         help="The path for each process's log.If it's not set, the log will printed to default pipe."
     )
     #positional
@@ -129,11 +137,11 @@ def get_cluster_from_args(args, gpus):
         _, node_ip = get_host_name_ip()
 
     # node_ip = args.node_ip
-    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips:{%s}" \
+    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
                 % (node_ip, node_ips)
     node_rank = node_ips.index(node_ip)
 
-    logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
+    logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
         node_ips, node_ip, node_rank))
 
     free_ports = None
@@ -187,8 +195,11 @@ def launch_collective(args):
     cluster = None
     pod = None
 
+    start_port = 6170
+    if os.environ.get('FLAGS_START_PORT') is not None:
+        start_port = os.environ.get('FLAGS_START_PORT')
     if cloud_utils.use_paddlecloud() and trainers_num != 1:
-        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus)
+        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port)
         logger.info("get cluster from cloud:{}".format(cluster))
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
@@ -213,11 +224,87 @@ def launch_collective(args):
 
 
 def launch_ps(args):
-    worker_num = args.worker_num
-    server_num = args.server_num
+    ports = None
     start_port = 6170
-    if os.environ.get('FLAGS_START_PORT') is not None:
-        start_port = os.environ.get('FLAGS_START_PORT')
+    if args.server_num:
+        server_num = args.server_num
+        ports = get_ports(server_num, 0)
+        server_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
+    else:
+        assert args.servers != "", "The setting of CPU mode must be either server_num or servers."
+        server_endpoints = args.servers
+    server_endpoints_ips = [
+        x.strip().split(":")[0] for x in server_endpoints.split(",")
+    ]
+    server_endpoints_port = [
+        x.strip().split(":")[1] for x in server_endpoints.split(",")
+    ]
+    server_num = len(server_endpoints_ips)
+
+    if args.worker_num:
+        worker_num = args.worker_num
+        ports = get_ports(worker_num, server_num)
+        worker_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
+    else:
+        assert args.workers != "", "The setting of CPU mode must be either worker_num or workers."
+        worker_endpoints = args.workers
+    worker_endpoints_ips = [
+        x.strip().split(":")[0] for x in worker_endpoints.split(",")
+    ]
+    worker_num = len(worker_endpoints_ips)
+    node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
+    worker_endpoints_len = [
+        len(x.strip().split(":")) for x in worker_endpoints.split(",")
+    ]
+    if 1 in worker_endpoints_len:
+        # if no port value in worker_endpoints, will set default port values.
+        worker_endpoints_port = range(start_port + server_num,
+                                      start_port + server_num + worker_num, 1)
+    else:
+        worker_endpoints_port = [
+            x.strip().split(":")[1] for x in worker_endpoints.split(",")
+        ]
+
+    # local train
+    if len(set(node_ips)) == 1:
+        current_node_ip = node_ips[0]
+    else:
+        _, current_node_ip = get_host_name_ip()
+
+    assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
+                % (current_node_ip, node_ips)
+    node_rank = node_ips.index(current_node_ip)
+    logger.debug(
+        "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
+        format(node_ips, current_node_ip, node_rank, server_endpoints_port))
+
+    cluster = Cluster(hdfs=None)
+    server_rank = 0
+    worker_rank = 0
+    for node_rank, ip in enumerate(node_ips):
+        pod = Pod()
+        pod.rank = node_rank
+        pod.addr = ip
+        for i in range(len(server_endpoints_ips)):
+            if ip == server_endpoints_ips[i]:
+                server = Trainer()
+                server.endpoint = "%s:%s" % (ip, server_endpoints_port[i])
+                server.rank = server_rank
+                server_rank += 1
+                pod.servers.append(server)
+        for j in range(len(worker_endpoints_ips)):
+            if ip == worker_endpoints_ips[j]:
+                worker = Trainer()
+                worker.endpoint = "%s:%s" % (ip, worker_endpoints_port[i])
+                worker.rank = worker_rank
+                worker_rank += 1
+                pod.workers.append(worker)
+
+        cluster.pods.append(pod)
+
+    pod_rank = node_ips.index(current_node_ip)
+    pod = cluster.pods[pod_rank]
+
     default_env = os.environ.copy()
     current_env = copy.copy(default_env)
     current_env.pop("http_proxy", None)
@@ -225,68 +312,78 @@ def launch_ps(args):
     procs = []
     cmds = []
     log_fns = []
-    ports = range(start_port, start_port + server_num, 1)
-    default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    user_endpoints = ""
-    if args.servers == "":
-        user_endpoints = default_endpoints
-    else:
-        user_endpoints = args.servers
-    user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
-    user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
-    for i in range(server_num):
+    for idx, cur_server in enumerate(pod.servers):
         current_env.update({
-            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
-            "PADDLE_PORT": user_endpoints_port[i],
+            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": user_endpoints_ips[i]
+            "POD_IP": cur_server.endpoint.split(":")[0]
         })
 
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
         cmds.append(cmd)
+
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
+            fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
             log_fns.append(fn)
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
 
-    for i in range(worker_num):
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = cur_server.rank
+        tp.local_rank = idx
+        tp.log_fn = fn
+        tp.log_offset = 0 if fn else None
+        tp.cmd = cmd
+
+        procs.append(tp)
+
+    for idx, cur_worker in enumerate(pod.workers):
         current_env.update({
-            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
+            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(i)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank)
         })
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
         cmds.append(cmd)
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
+            fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
             log_fns.append(fn)
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
+
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = cur_worker.rank
+        tp.local_rank = idx
+        tp.log_fn = fn
+        tp.log_offset = 0 if fn else None
+        tp.cmd = cmd
+
+        procs.append(tp)
 
     # only wait worker to finish here
     for i, proc in enumerate(procs):
-        if i < server_num:
+        if i < len(pod.servers):
             continue
-        procs[i].wait()
+        procs[i].proc.wait()
         if len(log_fns) > 0:
             log_fns[i].close()
 
     print("all workers exit, going to finish parameter server", file=sys.stderr)
-    for i in range(server_num):
+    for i in range(len(pod.servers)):
         if len(log_fns) > 0:
             log_fns[i].close()
-        procs[i].terminate()
+        procs[i].proc.terminate()
     print("all parameter server are killed", file=sys.stderr)
 
 
@@ -303,11 +400,15 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
-    if len(has_ps_args) > 0 or fluid.core.get_cuda_device_count() == 0:
-        logger.info("Run cpu parameter-sever mode.")
+    cuda_device_num = fluid.core.get_cuda_device_count()
+    if len(has_ps_args) > 0 or cuda_device_num == 0:
+        logger.info(
+            "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}".
+            format(has_ps_args, cuda_device_num))
         launch_ps(args)
     elif len(has_collective_args) > 0:
-        logger.info("Run gpu collective mode.")
+        logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}".
+                    format(has_collective_args, cuda_device_num))
         launch_collective(args)
     else:
         logger.warning(
diff --git a/python/paddle/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
similarity index 88%
rename from python/paddle/fleet/launch_utils.py
rename to python/paddle/distributed/fleet/launch_utils.py
index 040e7254f8c5b23465e4c65f27910f773ea62921..350d8ae2b44db3e8f8e6b00d95c2b7a9ca91f88b 100644
--- a/python/paddle/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -142,12 +142,16 @@ class Pod(object):
         self.addr = None
         self.port = None
         self.trainers = []
+        self.servers = []
+        self.workers = []
         self.gpus = []
 
     def __str__(self):
-        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
-            self.rank, self.id, self.addr, self.port, self.gpus,
-            [str(t) for t in self.trainers])
+        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
+            workers:{}".format(self.rank, self.id, self.addr, self.port,
+                               self.gpus, [str(t) for t in self.trainers],
+                               [str(s) for s in self.servers],
+                               [str(w) for w in self.workers])
 
     def __eq__(self, pod):
         if self.rank != pod.rank or \
@@ -168,6 +172,26 @@ class Pod(object):
                                                        pod.trainers[i]))
                 return False
 
+        if len(self.servers) != len(pod.servers):
+            logger.debug("servers {} != {}".format(self.servers, pod.servers))
+            return False
+
+        for i in range(len(self.servers)):
+            if self.servers[i] != pod.servers[i]:
+                logger.debug("servers {} != {}".format(self.servers[i],
+                                                       pod.servers[i]))
+                return False
+
+        if len(self.workers) != len(pod.workers):
+            logger.debug("workers {} != {}".format(self.workers, pod.workers))
+            return False
+
+        for i in range(len(self.workers)):
+            if self.workers[i] != pod.workers[i]:
+                logger.debug("workers {} != {}".format(self.workers[i],
+                                                       pod.workers[i]))
+                return False
+
         return True
 
     def __ne__(self, pod):
@@ -303,6 +327,17 @@ def find_free_ports(num):
     return None
 
 
+def get_ports(num, offset):
+    if os.environ.get('FLAGS_START_PORT') is None:
+        ports = find_free_ports(num)
+        if ports is not None:
+            ports = list(ports)
+    else:
+        start_port = os.environ.get('FLAGS_START_PORT')
+        ports = range(start_port + offset, start_port + offset + num, 1)
+    return ports
+
+
 class TrainerProc(object):
     def __init__(self):
         self.proc = None
diff --git a/python/paddle/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
old mode 100755
new mode 100644
similarity index 80%
rename from python/paddle/fleet/meta_optimizers/__init__.py
rename to python/paddle/distributed/fleet/meta_optimizers/__init__.py
index 718805c5aadaf3476fa1fc495a355395fec6396d..075e8b6c4302d792606849fc2981e46ccead1e56
--- a/python/paddle/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -15,17 +15,24 @@ from .amp_optimizer import AMPOptimizer
 from .recompute_optimizer import RecomputeOptimizer
 from .gradient_merge_optimizer import GradientMergeOptimizer
 from .graph_execution_optimizer import GraphExecutionOptimizer
+from .async_optimizer import AsyncMetaOptimizer
 from .pipeline_optimizer import PipelineOptimizer
 from .localsgd_optimizer import LocalSGDOptimizer
 from .lars_optimizer import LarsOptimizer
+from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
 from .dgc_optimizer import DGCOptimizer
+from .lamb_optimizer import LambOptimizer
 
 __all__ = [
     'AMPOptimizer',
     'RecomputeOptimizer',
     'GradientMergeOptimizer',
+    'AsyncMetaOptimizer',
+    'GraphExecutionOptimizer',
     'PipelineOptimizer',
     'LocalSGDOptimizer',
     'LarsOptimizer',
+    'AsyncGraphExecutionOptimizer',
     'DGCOptimizer',
+    'LambOptimizer',
 ]
diff --git a/python/paddle/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
similarity index 89%
rename from python/paddle/fleet/meta_optimizers/amp_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 8316d807fa87062a8e3fba0bcb3bd057d2231032..66db14209b4c57475c30c6dde083593e27f04ea0 100644
--- a/python/paddle/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -23,7 +23,12 @@ class AMPOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.amp_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
+            "LocalSGDOptimizer", "GradientMergeOptimizer",
+            "GraphExecutionOptimizer"
+        ]
+        self.meta_optimizers_black_list = ["DGCOptimizer"]
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -37,6 +42,7 @@ class AMPOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.amp = False
+        dist_strategy.amp_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0dee220aafd07bf69a198c6b03e6c957c50d4ce
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle import fluid
+from paddle.fluid import compiler
+from .async_optimizer import AsyncMetaOptimizer
+
+
+class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
+    def __init__(self, optimizer):
+        super(AsyncGraphExecutionOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+
+    def _can_apply(self):
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        if k_steps < 0:
+            return False
+
+        if self.role_maker.is_server():
+            return False
+
+        return True
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.a_sync_configs = {}
+
+    def _is_graph_out(self):
+        return True
+
+    def _try_to_compile(self, main_program, loss):
+        dist_strategy = self._get_distributed_strategy()
+
+        build_strategy = dist_strategy.get_build_strategy()
+        exec_strategy = dist_strategy.get_execute_strategy()
+
+        self._compiled_program = compiler.CompiledProgram(main_program)
+
+        self._compiled_program.with_data_parallel(
+            loss_name=loss.name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy,
+            share_vars_from=None)
+
+        return self._compiled_program
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        program = loss.block.program
+        compiled_program = self._try_to_compile(program, loss)
+        program._graph = compiled_program
+        # just return self.optimizer_ops and self.param_grads
+        return None, None
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b65435497284d279ebdea026e7ac88883a724c7c
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle import fluid
+from .meta_optimizer_base import MetaOptimizerBase
+
+
+class AsyncMetaOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(AsyncMetaOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+
+    def _is_graph_out(self):
+        return False
+
+    def _can_apply(self):
+        if self.role_maker._is_collective:
+            return False
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        return True if k_steps >= 0 else False
+
+    def _get_distributed_strategy(self):
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        strategy = None
+
+        if not self.user_defined_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_sync_strategy()
+
+        if self.user_defined_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_async_strategy()
+
+        if self.user_defined_strategy.a_sync and k_steps > 0:
+            strategy = StrategyFactory.create_geo_strategy(k_steps)
+
+        if not strategy:
+            raise ValueError("k_steps must be invalid value, please check")
+
+        return strategy
+
+    def _build_trainer_programs(self, compiled_config):
+        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
+
+        _main = compiled_config.origin_main_program.clone()
+        _startup = compiled_config.origin_startup_program.clone()
+
+        if not compiled_config.is_geo_mode():
+            # for main program
+            _main = worker.delete_optimizer_pass(_main, compiled_config)
+            _main = worker.distributed_ops_pass(_main, compiled_config)
+            _main = worker.append_send_ops_pass(_main, compiled_config)
+
+            # for startup program
+            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
+            _startup = worker.init_from_server_pass(_startup, compiled_config)
+            _startup = worker.delet_extra_optimizes_pass(_startup,
+                                                         compiled_config)
+        else:
+            _main = worker.append_send_ops_pass(_main, compiled_config)
+            _startup = _startup
+
+        return _main, _startup
+
+    def _build_pserver_programs(self, compiled_config):
+        from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
+
+        _main = fluid.Program()
+        _startup = fluid.Program()
+
+        if not compiled_config.is_geo_mode():
+            _main = server.add_listen_and_serv_pass(_main, compiled_config)
+            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
+            _main = server.add_optimizer_pass(_main, compiled_config)
+            _main = server.large_scale_sparse_pass(_main, _main,
+                                                   compiled_config, False)
+            _startup = server.build_pserver_startup_program_pass(
+                _startup, _main, compiled_config)
+            _startup = server.large_scale_sparse_pass(_startup, _main,
+                                                      compiled_config, True)
+
+            if not compiled_config.is_sync_mode():
+                _main = server.delete_unused_in_main_pass(_main,
+                                                          compiled_config)
+
+            _startup = server.delete_unused_in_startup_pass(_startup, _main,
+                                                            compiled_config)
+        else:
+            _main = server.add_listen_and_serv_pass(_main, compiled_config)
+            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
+            _main = server.add_geo_optimizer_pass(_main, compiled_config)
+            _main = server.large_scale_sparse_pass(_main, _main,
+                                                   compiled_config, False)
+            _startup = server.build_pserver_startup_program_pass(
+                _startup, _main, compiled_config)
+            _startup = server.large_scale_sparse_pass(_startup, _main,
+                                                      compiled_config, True)
+            _startup = server.delete_unused_in_startup_pass(_startup, _main,
+                                                            compiled_config)
+
+        return _main, _startup
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        self.inner_opt.minimize(loss, startup_program, parameter_list,
+                                no_grad_set)
+        strategy = self._get_distributed_strategy()
+
+        _origin_main_program = loss.block.program
+        _origin_startup_program = startup_program
+        from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
+
+        compiled_config = public.CompileTimeStrategy(_origin_main_program,
+                                                     _origin_startup_program,
+                                                     strategy, self.role_maker)
+
+        main_program, startup_program = \
+            self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \
+                else self._build_pserver_programs(compiled_config)
+
+        loss.block.program = main_program
+        fluid.framework.switch_startup_program(startup_program)
+
+        return None, None
+
+    def _disable_strategy(self, dist_strategy):
+        self.user_defined_strategy.a_sync_configs = {}
diff --git a/python/paddle/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
similarity index 100%
rename from python/paddle/fleet/meta_optimizers/common.py
rename to python/paddle/distributed/fleet/meta_optimizers/common.py
diff --git a/python/paddle/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
similarity index 95%
rename from python/paddle/fleet/meta_optimizers/dgc_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index c9a28fdaf11dd0d4d45cfd3fb1904b80dc136711..f34786f9dc309dd1f03319368bbc93ef1bfc03e3 100644
--- a/python/paddle/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -25,6 +25,7 @@ class DGCOptimizer(MetaOptimizerBase):
         self.dgc_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -68,11 +69,7 @@ class DGCOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.dgc = False
-        dist_strategy.dgc_configs = {
-            'rampup_begin_step': 0,
-            'rampup_step': 1,
-            'sparsity': [0.999]
-        }
+        dist_strategy.dgc_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
similarity index 88%
rename from python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 668cf605defaf5eb3f4e205c5a18548e45449a9c..bd52179a35862241768ad5bd01eedf16732ad3b6 100644
--- a/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -16,13 +16,20 @@ from .meta_optimizer_base import MetaOptimizerBase
 
 __all__ = ["GradientMergeOptimizer"]
 
+# amp + gradient merge + lamb
+
 
 class GradientMergeOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(GradientMergeOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         self.wrapped_opt = GM(optimizer)
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer",
+            "LambOptimizer",
+            "GraphExecutionOptimizer",
+        ]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -40,7 +47,7 @@ class GradientMergeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.gradient_merge = False
-        dist_strategy.gradient_merge_configs = {"k_steps": 1, "avg": True}
+        dist_strategy.gradient_merge_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
similarity index 93%
rename from python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 0473f7c1d689fb9cc2fc856a41076d0ab68baf0d..ace31687338f918ef260b3134b0bd429795542d0 100644
--- a/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -25,6 +25,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _is_graph_out(self):
         return True
@@ -119,18 +120,26 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         local_build_strategy.nccl_comm_num = \
                     dist_strategy.nccl_comm_num
 
+        if self.user_defined_strategy.recompute == True:
+            logging.warn(
+                "set enable_sequential_execution=True since you have enable the recompute strategy"
+            )
+            local_build_strategy.enable_sequential_execution = True
+
         exe_strategy = self.user_defined_strategy.execution_strategy
-        node_num = self.role_maker.worker_num()
+        worker_num = self.role_maker.worker_num()
+        node_num = self.role_maker.node_num()
 
         if self.role_maker._is_collective:
-            assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num
+            assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
 
-        if node_num <= 1:
+        if worker_num <= 1:
             # local mode
             if local_build_strategy.nccl_comm_num > 1:
                 logging.warn("set nccl_comm_num=1 since you only have 1 node.")
             local_build_strategy.nccl_comm_num = 1
 
+        if node_num <= 1:
             if local_build_strategy.use_hierarchical_allreduce:
                 logging.warn(
                     "set hierachical_allreduce=False since you only have 1 node."
@@ -190,7 +199,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
                  parameter_list=None,
                  no_grad_set=None):
         if startup_program == None:
-            startup_program = paddle.default_startup_program()
+            startup_program = paddle.static.default_startup_program()
         compiled_program = self._try_to_compile(startup_program,
                                                 loss.block.program, loss)
         loss.block.program._graph = compiled_program
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
new file mode 100755
index 0000000000000000000000000000000000000000..7e08a02eb1dc2e14b1871fe7743bbee8ade3feb3
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -0,0 +1,97 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.optimizer import LambOptimizer as LAMB
+from .meta_optimizer_base import MetaOptimizerBase
+import logging
+
+__all__ = ["LambOptimizer"]
+
+
+class LambOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(LambOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        self.lamb_opt = None
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = ["GraphExecutionOptimizer"]
+        self.meta_optimizers_black_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(LambOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+
+        opt = self.inner_opt
+        if not isinstance(opt, AdamOptimizer):
+            return
+
+        configs = self.user_defined_strategy.lamb_configs
+        if len(configs['exclude_from_weight_decay']) == 0:
+            _exclude_from_weight_decay_fn = None
+        else:
+
+            def exclude_fn(param):
+                exclude_list = configs['exclude_from_weight_decay']
+                for name in exclude_list:
+                    if param.name.endswith(name):
+                        return True
+                return False
+
+            _exclude_from_weight_decay_fn = exclude_fn
+
+        self.lamb_opt = LAMB(
+            learning_rate=opt._learning_rate,
+            lamb_weight_decay=configs['lamb_weight_decay'],
+            beta1=opt._beta1,
+            beta2=opt._beta2,
+            epsilon=opt._epsilon,
+            parameter_list=opt._parameter_list,
+            regularization=opt.regularization,
+            grad_clip=opt._grad_clip,
+            exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
+            name=opt._name)
+
+    def _can_apply(self):
+        if self.user_defined_strategy.lamb:
+            if not isinstance(self.inner_opt, AdamOptimizer):
+                logging.warn(
+                    "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".
+                    format(self.inner_opt.type))
+                return False
+            return True
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.lamb = False
+        dist_strategy.lamb_configs = {}
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        return self.lamb_opt.backward(loss, startup_program, parameter_list,
+                                      no_grad_set, callbacks)
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        optimize_ops, params_grads = \
+            self.lamb_opt.minimize(loss, startup_program,
+                                      parameter_list, no_grad_set)
+        return optimize_ops, params_grads
diff --git a/python/paddle/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
similarity index 94%
rename from python/paddle/fleet/meta_optimizers/lars_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index ff535e3ebf259cf646cb9649ee45acc409a8d0d7..09c418fa79106d05cffae1e8bc18fac9c0cc8f34 100755
--- a/python/paddle/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -24,7 +24,8 @@ class LarsOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.lars_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = ["GraphExecutionOptimizer"]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -58,10 +59,7 @@ class LarsOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.lars = False
-        dist_strategy.lars_configs = {
-            'lars_coeff': 0.001,
-            'lars_weight_decay': 0.0005,
-        }
+        dist_strategy.lars_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
similarity index 67%
rename from python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 05a120f8163755ad0effeccfe729f88782cfeebe..e22127c13999bfde7aa753ad1a66536913ab04f9 100644
--- a/python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from paddle.fluid import program_guard, layers
+from paddle.fluid import program_guard, layers, default_main_program
 from paddle.fluid.optimizer import Momentum, SGD
 from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op
@@ -25,6 +25,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         super(LocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = ["GraphExecutionOptimizer"]
         self.snapshot_key = '@SNAPSHOT'
 
     def _can_apply(self):
@@ -39,11 +40,35 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.localsgd = False
-        dist_strategy.localsgd_configs = {'k_steps': 1}
+        dist_strategy.localsgd_configs = {}
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
 
+    def create_snapshot_vars(self, program):
+        block = program.global_block()
+
+        non_dist_params = []
+        for param in block.iter_parameters():
+            if not param.is_distributed:
+                non_dist_params.append(param)
+
+        p2s = []
+        for param in non_dist_params:
+            snapshot = block.create_var(
+                name=self.snapshot_name(param.name),
+                shape=param.shape,
+                persistable=True,
+                stop_gradient=True,
+                dtype=param.dtype)
+            p2s.append([param, snapshot])
+        return p2s
+
+    def init_snapshot_vars(self, startup_program, param2snapshot):
+        with program_guard(startup_program):
+            for param, snapshot in param2snapshot:
+                layers.assign(param, snapshot)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
@@ -62,8 +87,11 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         self.nrings = 2
         collective_helper = CollectiveHelper(self.role_maker, self.nrings)
         collective_helper.update_startup_program(startup_program)
+        p2s = self.create_snapshot_vars(startup_program)
+        self.init_snapshot_vars(startup_program, p2s)
 
-        with program_guard(main_block.program):
+        p2s = self.create_snapshot_vars(main_block.program)
+        with program_guard(main_block.program, startup_program):
             step = layers.autoincreased_step_counter(begin=0)
             k_steps = layers.create_global_var(
                 name="k_steps",
@@ -79,6 +107,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                 persistable=True)
 
             if auto_steps:
+                avg_loss = layers.collective._c_allreduce(
+                    loss) / self.role_maker.worker_num()
+
                 lr_0 = layers.create_global_var(
                     name="lr_0",
                     shape=[1],
@@ -101,49 +132,32 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                 layers.cond(step == 0, initialize)
 
             def communicate():
-                ordered_param_snapshot = []
+                sub_block = default_main_program().current_block()
                 ring_id = -1
-                for idx, op in reversed(list(enumerate(main_block.ops))):
-                    if is_update_op(op):
-                        param = main_block.vars[op.input('Param')[0]]
-                        if param.is_distributed:
-                            continue
-
-                        snapshot = main_block.create_var(
-                            name=self.snapshot_name(param.name),
-                            shape=param.shape,
-                            persistable=True,
-                            stop_gradient=True,
-                            dtype=param.dtype)
-
-                        main_block._insert_op(
-                            idx + 1,
-                            type='elementwise_sub',
-                            inputs={'X': [snapshot],
-                                    'Y': [param]},
-                            outputs={'Out': [param]},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        main_block._insert_op(
-                            idx + 2,
-                            type='c_sync_calc_stream',
-                            inputs={'X': param},
-                            outputs={'Out': param},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        ring_id = (ring_id + 1) % self.nrings
-                        main_block._insert_op(
-                            idx + 3,
-                            type='c_allreduce_sum',
-                            inputs={'X': [param]},
-                            outputs={'Out': [param]},
-                            attrs={
-                                'ring_id': ring_id,
-                                OP_ROLE_KEY: OpRole.Optimize
-                            })
-
-                        ordered_param_snapshot.append((param, snapshot))
+                for param, snapshot in p2s:
+                    sub_block.append_op(
+                        type='elementwise_sub',
+                        inputs={'X': [snapshot],
+                                'Y': [param]},
+                        outputs={'Out': [param]},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(
+                        type='c_sync_calc_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    ring_id = (ring_id + 1) % self.nrings
+                    sub_block.append_op(
+                        type='c_allreduce_sum',
+                        inputs={'X': [param]},
+                        outputs={'Out': [param]},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Optimize
+                        })
 
                 for ring_id in range(self.nrings):
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='c_sync_comm_stream',
                         inputs={'X': param},
                         outputs={'Out': param},
@@ -152,10 +166,8 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                             OP_ROLE_KEY: OpRole.Optimize
                         })
 
-                for param_snapshot in reversed(ordered_param_snapshot):
-                    param = param_snapshot[0]
-                    snapshot = param_snapshot[1]
-                    main_block.append_op(
+                for param, snapshot in p2s:
+                    sub_block.append_op(
                         type='scale',
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
@@ -163,13 +175,13 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                             'scale': 1.0 / self.role_maker.worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='elementwise_sub',
                         inputs={'X': [snapshot],
                                 'Y': [param]},
                         outputs={'Out': [param]},
                         attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='assign',
                         inputs={'X': [param]},
                         outputs={'Out': [snapshot]},
diff --git a/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
similarity index 58%
rename from python/paddle/fleet/meta_optimizers/meta_optimizer_base.py
rename to python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 9ba184fb0089589a86d6444d12cf402b9687b041..12a4d904340337bf9a99968c7d82db117bf59ce8 100644
--- a/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -14,10 +14,16 @@
 
 __all__ = ["MetaOptimizerBase"]
 
+from paddle.fluid.optimizer import Optimizer
 
-class MetaOptimizerBase(object):
+
+class MetaOptimizerBase(Optimizer):
     def __init__(self, optimizer):
-        pass
+        self.inner_opt = optimizer
+        self._learning_rate = self.inner_opt._learning_rate
+        self._learning_rate_map = self.inner_opt._learning_rate_map
+        self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -26,7 +32,7 @@ class MetaOptimizerBase(object):
         self.user_defined_optimizer = user_defined_optimizer
         self.user_defined_strategy = user_defined_strategy
 
-    def _update_inner_optimier(self, optimizer):
+    def _update_inner_optimizer(self, optimizer):
         self.inner_opt = optimizer
 
     def _can_apply(self):
@@ -38,17 +44,43 @@ class MetaOptimizerBase(object):
     def _can_update(self, optimizer):
         if str(optimizer.__class__.__name__) in self.meta_optimizers_white_list:
             return True
+        return False
 
     def _disable_strategy(self, dist_strategy):
         raise NotImplementedError("you should implement disable strategy in {}".
                                   format(type(self).__name__))
 
+    def apply_gradients(self, params_grads):
+        return self.inner_opt.apply_gradients(params_grads=params_grads)
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        return self.inner_opt.backward(loss, startup_program, parameter_list,
+                                       no_grad_set, callbacks)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.inner_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        raise NotImplementedError("meta optimizer not implemented")
+        params_grads = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
+
+        optimize_ops = self.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
+        return optimize_ops, params_grads
 
     def minimize(self,
                  loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe9221307cbacfa1beaf030b70a4e4b9223769cc
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -0,0 +1,225 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid import core, unique_name
+from ..base.private_helper_function import wait_server_ready
+from paddle.fluid.optimizer import PipelineOptimizer as PO
+from .meta_optimizer_base import MetaOptimizerBase
+from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
+
+__all__ = ["PipelineOptimizer"]
+
+
+class PipelineHelper(CollectiveHelper):
+    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+        super(PipelineHelper, self).__init__(role_maker, nrings, wait_port)
+
+    def _init_communicator(self, program, current_endpoint, endpoints, rank,
+                           ring_id, wait_port):
+        nranks = len(endpoints)
+        other_endpoints = endpoints[:]
+        other_endpoints.remove(current_endpoint)
+        if rank == 0 and wait_port:
+            wait_server_ready(other_endpoints)
+
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=unique_name.generate('nccl_id'),
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': rank,
+                'endpoint': current_endpoint,
+                'other_endpoints': other_endpoints,
+                OP_ROLE_KEY: OpRole.Forward
+            })
+
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': nranks,
+                'rank': rank,
+                'ring_id': ring_id,
+                OP_ROLE_KEY: OpRole.Forward,
+                'device_id': OpRole.Forward
+            })
+
+    def _broadcast_params(self):
+        block = self.startup_program.global_block()
+        ring_id = 0
+        for param in block.iter_parameters():
+            if param.is_distributed:
+                continue
+
+            block.append_op(
+                type='c_broadcast',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': 0,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
+        for ring_id in range(self.nrings):
+            block.append_op(
+                type='c_sync_comm_stream',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={'ring_id': ring_id,
+                       OP_ROLE_KEY: OpRole.Forward})
+
+
+class PipelineOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(PipelineOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(PipelineOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
+        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
+
+    def _can_apply(self):
+        if self.user_defined_strategy.pipeline == True:
+            return True
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.pipeline = False
+        dist_strategy.pipeline_configs = {}
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        optimize_ops, params_grads, prog_list = \
+            self.wrapped_opt.minimize(loss, startup_program,
+                                      parameter_list, no_grad_set)
+        if self.role_maker.worker_num() == 1:
+            return optimize_ops, params_grads
+
+        endpoints = self.role_maker.get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker.worker_index()]
+        self.startup_program = startup_program
+        if startup_program is None:
+            self.startup_program = fluid.default_startup_program()
+
+        assert prog_list
+        self.main_program_list = prog_list
+        self.main_program = loss.block.program
+        nranks = len(endpoints)
+        self.nranks = nranks
+        self.nrings = len(self.main_program_list)
+
+        self.rank = self.role_maker.worker_index()
+        self.endpoints = endpoints
+        self.current_endpoint = current_endpoint
+
+        pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings)
+        pipeline_helper.update_startup_program(self.startup_program)
+
+        self._transpile_main_program()
+        return optimize_ops, params_grads
+
+    def _transpile_main_program(self):
+        self._insert_loss_grad_ops()
+        for ring_id in range(self.nrings):
+            self._insert_allreduce_ops(ring_id)
+
+    def _insert_loss_grad_ops(self):
+        """
+        In order to keep the learning rate consistent in different numbers of
+        training workers, we scale the loss grad by the number of workers
+        """
+        block = self.main_program_list[self.nrings - 1]['program'].global_block(
+        )
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if is_loss_grad_op(op):
+                loss_grad_var = block.vars[op.output_arg_names[0]]
+                block._insert_op(
+                    idx + 1,
+                    type='scale',
+                    inputs={'X': loss_grad_var},
+                    outputs={'Out': loss_grad_var},
+                    attrs={
+                        'scale': 1.0 / self.nranks,
+                        OP_ROLE_KEY: OpRole.Backward
+                    })
+
+    def _insert_allreduce_ops(self, ring_id):
+        block = self.main_program_list[ring_id]['program'].global_block()
+        origin_block = self.main_program.global_block()
+        grad = None
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and \
+                OP_ROLE_VAR_KEY in op.attr_names:
+                op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
+                if len(op_role_var) == 0:
+                    continue
+                assert len(op_role_var) % 2 == 0
+                offset = idx
+                for i in range(0, len(op_role_var), 2):
+                    param = block.vars[op_role_var[i]]
+                    grad = block.vars[op_role_var[i + 1]]
+                    origin_param = origin_block.vars[op_role_var[i]]
+                    if origin_param.is_distributed:
+                        continue
+                    if offset == idx:
+                        offset += 1
+                        block._insert_op(
+                            offset,
+                            type='c_sync_calc_stream',
+                            inputs={'X': grad},
+                            outputs={'Out': grad},
+                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                        offset += 1
+
+                    block._insert_op(
+                        offset,
+                        type='c_sync_calc_stream',
+                        inputs={'X': grad},
+                        outputs={'Out': grad},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Backward
+                        })
+
+        if grad is None:
+            return
+
+        for idx, op in enumerate(block.ops):
+            if is_optimizer_op(op):
+                block._insert_op(
+                    idx + ring_id,
+                    type='c_sync_comm_stream',
+                    inputs={'X': grad},
+                    outputs={'Out': grad},
+                    attrs={'ring_id': ring_id,
+                           OP_ROLE_KEY: OpRole.Backward})
+            break
diff --git a/python/paddle/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
similarity index 85%
rename from python/paddle/fleet/meta_optimizers/recompute_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 73119d81094ac611c0d3545b59342b5dbd8b5d16..45130b447125f6ecbade2e4e5e3dad2f127fda52 100644
--- a/python/paddle/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -24,13 +24,20 @@ class RecomputeOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.wrapped_opt = RO(optimizer)
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer",
+            "LambOptimizer",
+            "GradientMergeOptimizer",
+            "GraphExecutionOptimizer",
+        ]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
         super(RecomputeOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        self.wrapped_opt._set_checkpoints([])
+        self.wrapped_opt._set_checkpoints(
+            list(user_defined_strategy.recompute_configs["checkpoints"]))
 
     def _can_apply(self):
         if self.user_defined_strategy.recompute == True:
@@ -42,7 +49,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.recompute = False
-        dist_strategy.recompute_configs = {"checkpoints": []}
+        dist_strategy.recompute_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py
similarity index 100%
rename from python/paddle/fleet/metrics/__init__.py
rename to python/paddle/distributed/fleet/metrics/__init__.py
diff --git a/python/paddle/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py
similarity index 95%
rename from python/paddle/fleet/metrics/metric.py
rename to python/paddle/distributed/fleet/metrics/metric.py
index 152ee21c147b01e549257bf8821c5c656ee81d0d..12a24292e5a3ad9ea838d9451fdf72e7e846a528 100644
--- a/python/paddle/fleet/metrics/metric.py
+++ b/python/paddle/distributed/fleet/metrics/metric.py
@@ -43,7 +43,7 @@ def sum(input, scope=None):
           
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("sum array: ", paddle.fleet.sum(res))
+          print("sum array: ", paddle.distributed.fleet.sum(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -82,7 +82,7 @@ def max(input, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("max array: ", paddle.fleet.max(res))
+          print("max array: ", paddle.distributed.fleet.max(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -121,7 +121,7 @@ def min(input, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("min array: ", paddle.fleet.min(res))
+          print("min array: ", paddle.distributed.fleet.min(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -162,7 +162,7 @@ def auc(stat_pos, stat_neg, scope=None):
           # in train.py, after train or infer
           pos = np.array(scope.find_var(stat_pos.name).get_tensor())
           neg = np.array(scope.find_var(stat_neg.name).get_tensor())
-          print("auc: ", paddle.fleet.auc(pos, neg))
+          print("auc: ", paddle.distributed.fleet.auc(pos, neg))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -240,7 +240,7 @@ def mae(abserr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(abserr.name).get_tensor())
-          print("mae: ", paddle.fleet.mae(res, total_ins_num))
+          print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -278,7 +278,7 @@ def rmse(sqrerr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("rmse: ", paddle.fleet.rmse(res, total_ins_num))
+          print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -316,7 +316,7 @@ def mse(sqrerr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           metric = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("mse: ", paddle.fleet.mse(metric, total_ins_num))
+          print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -365,7 +365,7 @@ def acc(correct, total, scope=None):
           # in train.py, after train or infer
           correct_num = np.array(scope.find_var(correct.name).get_tensor())
           total_num = np.array(scope.find_var(total.name).get_tensor())
-          print("accuracy: ", paddle.fleet.acc(correct_num, total_num))
+          print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
diff --git a/python/paddle/distributed/fleet/runtime/__init__.py b/python/paddle/distributed/fleet/runtime/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a796a73fc981b7edbcd57e8f5858456031e7ae6e
--- /dev/null
+++ b/python/paddle/distributed/fleet/runtime/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .collective_runtime import CollectiveRuntime
+from .parameter_server_runtime import ParameterServerRuntime
+
+__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ]
diff --git a/python/paddle/fleet/runtime/collective_runtime.py b/python/paddle/distributed/fleet/runtime/collective_runtime.py
similarity index 97%
rename from python/paddle/fleet/runtime/collective_runtime.py
rename to python/paddle/distributed/fleet/runtime/collective_runtime.py
index 0881c4b52c822908cedc94d3f4de088eed6c65e8..c56cf4c7aa2ed86f4529b1bb09d51ce64d86cfc8 100644
--- a/python/paddle/fleet/runtime/collective_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/collective_runtime.py
@@ -30,7 +30,7 @@ class CollectiveRuntime(RuntimeBase):
             "You should not call 'run_worker' method for collective mode.")
         pass
 
-    def _init_server(self):
+    def _init_server(self, *args, **kwargs):
         logging.warn(
             "You should not call 'init_server' method for collective mode.")
         pass
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..c731ed08893348d0be604eb383905cd4a9d6e228
--- /dev/null
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -0,0 +1,555 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import warnings
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.framework import Program
+from paddle.fluid.compiler import CompiledProgram
+from paddle.fluid.executor import Executor
+from paddle.fluid.parallel_executor import ParallelExecutor
+
+from .runtime_base import RuntimeBase
+
+
+class ParameterServerRuntime(RuntimeBase):
+    def __init__(self):
+        super(ParameterServerRuntime, self).__init__()
+        self._communicator = None
+
+    def _set_basic_info(self, context):
+        self.context = context
+        self.role_maker = context["role_maker"]
+        self.origin_main_program = context["origin_main_program"]
+        self.origin_startup_program = context["origin_startup_program"]
+        self.async_strategy = self._get_distributed_strategy()
+        self.compiled_strategy = self.build_compiled_startegy()
+
+    def _get_distributed_strategy(self):
+        strategy = None
+
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+        dist_strategy = self.context["valid_strategy"]
+        k_steps = dist_strategy.a_sync_configs["k_steps"]
+
+        if not dist_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_sync_strategy()
+
+        if dist_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_async_strategy()
+
+        if dist_strategy.a_sync and k_steps > 0:
+            strategy = StrategyFactory.create_geo_strategy(k_steps)
+
+        if not strategy:
+            raise ValueError("k_steps must be invalid value, please check")
+
+        return strategy
+
+    def build_compiled_startegy(self):
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy
+
+        compiled_config = CompileTimeStrategy(
+            self.origin_main_program, self.origin_main_program,
+            self.async_strategy, self.role_maker)
+        return compiled_config
+
+    def _load_sparse_params(self, dirname, varnames):
+        from paddle.fluid.communicator import LargeScaleKV
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
+
+        scale_kv = LargeScaleKV()
+        for varname in varnames:
+            origin_varname, _, _ = _get_varname_parts(varname)
+            sparse_dir = os.path.join(dirname, origin_varname, varname)
+            scale_kv.load(varname, sparse_dir)
+
+    @staticmethod
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+
+            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
+
+            origin_varname, _, _ = _get_varname_parts(var.name)
+            if origin_varname.endswith("@GRAD"):
+                return False
+
+            if origin_varname == "learning_rate_0":
+                return False
+
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                            var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    def _init_worker(self):
+        def sync_strategy_envs():
+            kwargs = {}
+            kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
+            )
+            kwargs["trainer_id"] = self.role_maker.worker_index()
+            return kwargs
+
+        def geo_strategy_envs():
+            from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
+
+            def get_sparse_attrs():
+                opt_init_map = {}
+                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
+                opt_init_map["fill_constant"] = ["value"]
+                opt_init_map["uniform_random"] = ["seed", "min", "max"]
+                opt_init_map[
+                    "truncated_gaussian_random"] = ["seed", "mean", "std"]
+
+                dist_varnames = get_sparse_tablenames(self.origin_main_program,
+                                                      True)
+                sparse_varnames = get_sparse_tablenames(
+                    self.origin_main_program, False)
+
+                if len(dist_varnames) != 0:
+                    raise ValueError(
+                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
+                    )
+
+                init_attrs = []
+                for value_name in sparse_varnames:
+                    value_var = self.origin_main_program.global_block().vars[
+                        value_name]
+                    value_attr = [
+                        value_name,
+                        ",".join([str(dim) for dim in value_var.shape])
+                    ]
+                    for op in self.origin_startup_program.global_block().ops:
+                        if op.type in opt_init_map.keys(
+                        ) and value_name == op.output("Out")[0]:
+                            init_attr = [op.type]
+                            for attr in opt_init_map[op.type]:
+                                init_attr.append(str(op.attr(attr)))
+                            value_attr.append("&".join(init_attr))
+                            init_attrs.append(":".join(value_attr))
+                            break
+                return "#".join(init_attrs)
+
+            kwargs = {}
+            kwargs["trainers"] = self.role_maker.worker_num()
+            kwargs["sparse_attrs"] = get_sparse_attrs()
+            return kwargs
+
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
+
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
+            SyncStrategy, GeoStrategy
+
+        trainer_config = self.async_strategy.get_trainer_runtime_config()
+        lrs = _get_lr_ops(self.origin_main_program)
+
+        if len(lrs) > 0:
+            kwargs = {"need_global_step": "1"}
+        else:
+            kwargs = {"need_global_step": "0"}
+
+        if isinstance(self.async_strategy, GeoStrategy):
+            geo_kwargs = geo_strategy_envs()
+            kwargs.update(geo_kwargs)
+        if isinstance(self.async_strategy, SyncStrategy):
+            sync_kwargs = sync_strategy_envs()
+            kwargs.update(sync_kwargs)
+
+        kwargs = kwargs if kwargs else None
+
+        send_ctx = self.compiled_strategy.get_communicator_send_context()
+
+        if self.compiled_strategy.is_geo_mode():
+            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
+                recv_type=4)
+        else:
+            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
+                recv_type=1)
+
+        from paddle.fluid.communicator import Communicator
+        self._communicator = Communicator(
+            trainer_config.mode, kwargs,
+            trainer_config.get_communicator_flags())
+        self._communicator.init_with_ctx(send_ctx, recv_ctx)
+
+        if not self._communicator.is_running():
+            self._communicator.start()
+        else:
+            warnings.warn("communicator has been initialized, skip")
+
+    def _init_server(self, *args, **kwargs):
+        if len(args) > 1:
+            raise ValueError("init server can only accept 1 args: `dirname`")
+        elif len(args) == 1:
+            model_dirname = args[0]
+        else:
+            model_dirname = None
+
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.run(fluid.default_startup_program())
+
+        if not model_dirname:
+            return
+
+        if not os.path.isdir(model_dirname):
+            raise ValueError("There is no directory named '%s'", model_dirname)
+
+        sparse_varnames = self.compiled_strategy.get_sparse_varname_on_ps(True)
+
+        distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
+            False)
+
+        remaining_vars = list(
+            filter(
+                ParameterServerRuntime.__exclude_vars(sparse_varnames +
+                                                      distribtued_varnames),
+                fluid.default_main_program().list_vars()))
+
+        fluid.io.load_vars(
+            executor,
+            main_program=fluid.default_main_program(),
+            dirname=model_dirname,
+            vars=remaining_vars)
+
+        self._load_sparse_params(
+            dirname=model_dirname, varnames=sparse_varnames)
+
+        # todo(tangwei12) load distributed vars
+        # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
+
+    def _run_server(self):
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.run(fluid.default_main_program())
+
+    def _stop_worker(self):
+        self._communicator.stop()
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.close()
+
+    def _get_optimizer_status(self, op, param_name):
+        supported_opts = [
+            "sgd", "adam", "adagrad", "adamax", "momentum", "lars_momentum",
+            "rmsprop", "decayed_adagrad", "ftrl"
+        ]
+
+        reshaped_val_map = {}
+        reshaped_val_map["sgd"] = []
+        reshaped_val_map["adam"] = ["moment1_0", "moment2_0"]
+        reshaped_val_map["adagrad"] = ["moment_0"]
+        reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
+        reshaped_val_map["momentum"] = ["velocity_0"]
+        reshaped_val_map["lars_momentum"] = ["velocity_0"]
+        reshaped_val_map[
+            "rmsprop"] = ["momentum_0", "mean_square_0", "mean_grad_0"]
+        reshaped_val_map["decayed_adagrad"] = ["moment_0"]
+        reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]
+
+        orishaped_val_map = {}
+        orishaped_val_map["adam"] = ["beta1_pow_acc_0", "beta2_pow_acc_0"]
+        orishaped_val_map["adamax"] = ["beta1_pow_acc_0"]
+
+        if op not in supported_opts:
+            raise ValueError(
+                "fleet can not support optimizer: {}, only this can be supported: {}".
+                format(op, supported_opts))
+
+        reshaped_names = [
+            param_name + "_" + val for val in reshaped_val_map[op]
+        ]
+
+        if op not in orishaped_val_map:
+            origin_names = []
+        else:
+            origin_names = [
+                param_name + "_" + val for val in orishaped_val_map[op]
+            ]
+        return reshaped_names, origin_names
+
+    def _get_optimizer_op(self, param_name):
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
+
+        opts = _get_optimize_ops(self.origin_main_program)
+        for op in opts:
+            if "Param" in op.input_names and \
+                            "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
+                return op
+
+    def _save_dense_params(self, executor, dirname, context, main_program):
+        self._communicator.recv()
+
+        prog = Program()
+        block = prog.global_block()
+        local_vars = []
+
+        for name, var_ctx in context.items():
+            if len(var_ctx.origin_varnames()) != 1:
+                raise ValueError("Dense can not support split now.")
+
+            varname = var_ctx.origin_varnames()[0]
+            local_vars.append(varname)
+
+            optimizer = self._get_optimizer_op(varname)
+            reshaped_varnames, origin_varnames = self._get_optimizer_status(
+                optimizer.type, varname)
+
+            for var_name in [varname] + reshaped_varnames + origin_varnames:
+                var = self.origin_main_program.global_block().vars[var_name]
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes":
+                        [",".join([str(i) for i in var.shape])],
+                        "slice_varnames": [var.name],
+                        "remote_varnames": [var.name],
+                        "is_sparse": False,
+                        "endpoints": var_ctx.split_endpoints(),
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+
+        executor.run(prog)
+        return local_vars
+
+    def _save_sparse_params(self, executor, dirname, context, main_program):
+        prog = Program()
+        block = prog.global_block()
+        local_vars = []
+
+        for name, var_ctx in context.items():
+            if len(var_ctx.origin_varnames()) != 1:
+                raise ValueError("Dense can not support split now.")
+
+            varname = var_ctx.origin_varnames()[0]
+            local_vars.append(varname)
+
+            optimizer = self._get_optimizer_op(varname)
+            reshaped_varnames, origin_varnames = self._get_optimizer_status(
+                optimizer.type, varname)
+
+            var = self.origin_main_program.global_block().vars[varname]
+            slice_shapes = []
+            dims1 = ",".join([str(i) for i in var.shape[1:]])
+
+            for section in var_ctx.sections():
+                slice_shapes.append(str(section) + dims1)
+
+            block.append_op(
+                type='recv_save',
+                attrs={
+                    "trainer_id": self.role_maker.worker_index(),
+                    "shape": var.shape,
+                    "slice_shapes": slice_shapes,
+                    "slice_varnames": var_ctx.split_varnames(),
+                    "remote_varnames": var_ctx.split_varnames(),
+                    "is_sparse": True,
+                    "endpoints": var_ctx.split_endpoints(),
+                    "pserver_num": len(self.role_maker.get_pserver_endpoints()),
+                    "file_path": os.path.join(dirname, var.name)
+                })
+
+            for reshaped_varname in reshaped_varnames:
+                var = self.origin_main_program.global_block().vars[
+                    reshaped_varname]
+
+                slice_varnames = []
+                remote_varnames = []
+                for i in range(len(var_ctx.split_varnames())):
+                    slice_varnames.append("{}.block{}".format(reshaped_varname,
+                                                              i))
+                    remote_varnames.append(reshaped_varname)
+
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes": slice_shapes,
+                        "slice_varnames": slice_varnames,
+                        "remote_varnames": remote_varnames,
+                        "is_sparse": True,
+                        "endpoints": var_ctx.split_endpoints(),
+                        "pserver_num":
+                        len(self.role_maker.get_pserver_endpoints()),
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+
+            for origin_varname in origin_varnames:
+                var = self.origin_main_program.global_block().vars[
+                    origin_varname]
+
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes":
+                        [",".join([str(i) for i in var.shape])],
+                        "slice_varnames": [origin_varname],
+                        "remote_varnames": [origin_varname],
+                        "is_sparse": False,
+                        "endpoints": var_ctx.split_endpoints()[:1],
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+        executor.run(prog)
+        return context.keys()
+
+    def _save_distributed_params(self, executor, dirname, context,
+                                 main_program):
+        prog = Program()
+        block = prog.global_block()
+
+        for name, var_ctx in context.items():
+            block.append_op(
+                type='checkpoint_notify',
+                attrs={
+                    "varname": name,
+                    "is_slice": True,
+                    "slice_varnames": var_ctx.split_varnames(),
+                    "remote_varnames": var_ctx.split_varnames(),
+                    "endpoints": var_ctx.split_endpoints(),
+                    "dirname": dirname
+                })
+
+        executor.run(prog)
+        return context.keys()
+
+    def _save_distributed_persistables(self, executor, dirname, main_program):
+        dense_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=1)
+
+        sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=2)
+
+        distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=3)
+
+        recv_dense_varnames = self._save_dense_params(executor, dirname,
+                                                      dense_ctx, main_program)
+
+        recv_sparse_varnames = self._save_sparse_params(
+            executor, dirname, sparse_ctx, main_program)
+
+        recv_distributed_varnames = self._save_distributed_params(
+            executor, dirname, distributed_ctx, main_program)
+
+        saved_varnames = recv_dense_varnames + list(
+            recv_sparse_varnames) + list(recv_distributed_varnames)
+
+        remaining_vars = list(
+            filter(
+                ParameterServerRuntime.__exclude_vars(saved_varnames),
+                main_program.list_vars()))
+
+        fluid.io.save_vars(
+            executor,
+            main_program=main_program,
+            dirname=dirname,
+            vars=remaining_vars)
+
+    def _ps_inference_save_persistables(self,
+                                        executor,
+                                        dirname,
+                                        main_program=None,
+                                        **kwargs):
+        """
+        This function filters out all variables with `persistable==True` from the
+        give `main_program` and then saves these variables to the folder `dirname`
+        or file `filename`.
+
+        The `dirname` is used to specify the folder where persistable variables
+        are going to be saved. If you would like to save variables in separate
+        files, set `filename` None; if you would like to save all variables in a
+        single file, use `filename` to specify the file name.
+        """
+
+        if isinstance(executor, ParallelExecutor):
+            raise TypeError(
+                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
+            )
+
+        if not isinstance(executor, Executor):
+            raise TypeError(
+                "in fleet.save_persistables() function, executor must be as Executor type"
+            )
+
+        if main_program is None:
+            main_program = fluid.default_main_program()
+
+        if isinstance(main_program, CompiledProgram):
+            raise TypeError(
+                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
+            )
+
+        self._save_distributed_persistables(executor, dirname, main_program)
+
+    def _ps_inference_save_inference_model(self,
+                                           executor,
+                                           dirname,
+                                           feeded_var_names,
+                                           target_vars,
+                                           main_program=None,
+                                           export_for_deployment=True):
+        """
+        Prune the given `main_program` to build a new program especially for inference,
+        and then save it and all related parameters to given `dirname` by the `executor`.
+        """
+
+        if isinstance(executor, ParallelExecutor):
+            raise TypeError(
+                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
+            )
+
+        if not isinstance(executor, Executor):
+            raise TypeError(
+                "in fleet.save_inference_model() function, executor must be as Executor type"
+            )
+
+        if main_program is not None:
+            if isinstance(main_program, CompiledProgram):
+                raise TypeError(
+                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
+                )
+            fluid.io.save_inference_model(dirname, feeded_var_names,
+                                          target_vars, executor, main_program,
+                                          None, None, export_for_deployment)
+        else:
+            fluid.io.save_inference_model(dirname, feeded_var_names,
+                                          target_vars, executor,
+                                          self.origin_main_program, None, None,
+                                          export_for_deployment, True)
+
+            model_basename = "__model__"
+            model_filename = os.path.join(dirname, model_basename)
+
+            with open(model_filename, "rb") as f:
+                program_desc_str = f.read()
+
+            program = Program.parse_from_string(program_desc_str)
+            program._copy_dist_param_info_from(fluid.default_main_program())
+            self._ps_inference_save_persistables(executor, dirname, program)
+
+    def _save_inference_model(self, *args, **kwargs):
+        self._ps_inference_save_inference_model(*args, **kwargs)
+
+    def _save_persistables(self, *args, **kwargs):
+        self._ps_inference_save_persistables(*args, **kwargs)
diff --git a/python/paddle/fleet/runtime/runtime_base.py b/python/paddle/distributed/fleet/runtime/runtime_base.py
similarity index 83%
rename from python/paddle/fleet/runtime/runtime_base.py
rename to python/paddle/distributed/fleet/runtime/runtime_base.py
index c7ce8b5a2914bf30f346cbd0777d1d233ddf5e1b..2e8bacfbc3b1ded58e63e8d9e93764a0c0090b91 100644
--- a/python/paddle/fleet/runtime/runtime_base.py
+++ b/python/paddle/distributed/fleet/runtime/runtime_base.py
@@ -25,7 +25,7 @@ class RuntimeBase(object):
     def _run_worker(self):
         pass
 
-    def _init_server(self):
+    def _init_server(self, *args, **kwargs):
         pass
 
     def _run_server(self):
@@ -33,3 +33,9 @@ class RuntimeBase(object):
 
     def _stop_worker(self):
         pass
+
+    def _save_inference_model(self, *args, **kwargs):
+        pass
+
+    def _save_persistables(self, *args, **kwargs):
+        pass
diff --git a/python/paddle/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
similarity index 100%
rename from python/paddle/fleet/utils/__init__.py
rename to python/paddle/distributed/fleet/utils/__init__.py
diff --git a/python/paddle/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
similarity index 54%
rename from python/paddle/fleet/utils/fs.py
rename to python/paddle/distributed/fleet/utils/fs.py
index 3fec773f2731803cd9166ae0500dba68f4f0011b..2dbe5cefbb4944e219989358ebeb0c321f942551 100644
--- a/python/paddle/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -26,6 +26,7 @@ import logging
 import six
 import abc
 import paddle.fluid as fluid
+from paddle.fluid import core
 import functools
 
 from pathlib import PurePosixPath, Path
@@ -33,7 +34,7 @@ import shutil
 
 __all__ = [
     'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
-    'FSFileExistsError', 'FSFileNotExistsError'
+    'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted'
 ]
 
 
@@ -53,6 +54,10 @@ class FSTimeOut(Exception):
     pass
 
 
+class FSShellCmdAborted(ExecuteError):
+    pass
+
+
 class FS(object):
     @abc.abstractmethod
     def ls_dir(self, fs_path):
@@ -95,7 +100,7 @@ class FS(object):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path):
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
         raise NotImplementedError
 
     @abc.abstractmethod
@@ -103,15 +108,11 @@ class FS(object):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def glob(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def stat(self, fs_path):
+    def list_dirs(self, fs_path):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def walk(self, fs_path):
+    def touch(self, fs_path, exist_ok=True):
         raise NotImplementedError
 
 
@@ -135,14 +136,8 @@ class LocalFS(FS):
             fs_path)
         os.system("mkdir -p {}".format(fs_path))
 
-    def is_file(self, fs_path):
-        return os.path.isfile(fs_path)
-
-    def is_dir(self, fs_path):
-        return os.path.isdir(fs_path)
-
-    def is_exist(self, fs_path):
-        return os.path.exists(fs_path)
+    def rename(self, fs_src_path, fs_dst_path):
+        os.rename(fs_src_path, fs_dst_path)
 
     def _rmr(self, fs_path):
         shutil.rmtree(fs_path)
@@ -159,24 +154,51 @@ class LocalFS(FS):
 
         return self._rmr(fs_path)
 
-    def rename(self, fs_src_path, fs_dst_path):
-        os.rename(fs_src_path, fs_dst_path)
-
     def need_upload_download(self):
         return False
 
-    def touch(self, fs_path):
-        return Path(fs_path).touch()
+    def is_file(self, fs_path):
+        return os.path.isfile(fs_path)
+
+    def is_dir(self, fs_path):
+        return os.path.isdir(fs_path)
+
+    def is_exist(self, fs_path):
+        return os.path.exists(fs_path)
+
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return Path(fs_path).touch(exist_ok=True)
 
-    def mv(self, src_path, dst_path):
+    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
         if not self.is_exist(src_path):
             raise FSFileNotExistsError
 
+        if overwrite and self.is_exist(dst_path):
+            self.delete(dst_path)
+
         if self.is_exist(dst_path):
             raise FSFileExistsError
 
         return self.rename(src_path, dst_path)
 
+    def list_dirs(self, fs_path):
+        """	
+        list directory under fs_path, and only give the pure name, not include the fs_path	
+        """
+        if not self.is_exist(fs_path):
+            return []
+
+        dirs = [
+            f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
+        ]
+
+        return dirs
+
 
 """HDFS Utils."""
 
@@ -198,6 +220,41 @@ def _handle_errors(f):
     return functools.wraps(f)(handler)
 
 
+def _handle_errors(max_time_out=None):
+    def decorator(f):
+        @functools.wraps(f)
+        def handler(*args, **kwargs):
+            o = args[0]
+            time_out = max_time_out
+            if time_out is None:
+                time_out = float(o._time_out) / 1000.0
+            else:
+                time_out /= 1000.0
+            inter = float(o._sleep_inter) / 1000.0
+
+            start = time.time()
+            last_print_time = start
+            while True:
+                try:
+                    return f(*args, **kwargs)
+                #important: only ExecuteError need to retry
+                except ExecuteError as e:
+                    if time.time() - start >= time_out:
+                        raise FSTimeOut("args:{} timeout:{}".format(
+                            args, time.time() - start))
+
+                    time.sleep(inter)
+
+                if time.time() - last_print_time > 30:
+                    print("hadoop operator timeout:args:{} timeout:{}".format(
+                        args, time.time() - start))
+                    last_print_time = time.time()
+
+        return handler
+
+    return decorator
+
+
 class HDFSClient(FS):
     def __init__(
             self,
@@ -216,7 +273,8 @@ class HDFSClient(FS):
 
         if configs:
             for k, v in six.iteritems(configs):
-                self.pre_commands.append('-D%s=%s' % (k, v))
+                config_command = '-D%s=%s' % (k, v)
+                self.pre_commands.append(config_command)
 
         self._time_out = time_out
         self._sleep_inter = sleep_inter
@@ -225,10 +283,22 @@ class HDFSClient(FS):
             r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
 
     def _run_cmd(self, cmd, redirect_stderr=False):
-        ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
-        return int(ret), output.splitlines()
+        exe_cmd = "{} -{}".format(self._base_cmd, cmd)
+        ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr)
+        ret = int(ret)
+        if ret == 134:
+            raise FSShellCmdAborted(cmd)
+        return ret, output.splitlines()
+
+    @_handle_errors()
+    def list_dirs(self, fs_path):
+        if not self.is_exist(fs_path):
+            return []
 
-    @_handle_errors
+        dirs, files = self._ls_dir(fs_path)
+        return dirs
+
+    @_handle_errors()
     def ls_dir(self, fs_path):
         """	
         list directory under fs_path, and only give the pure name, not include the fs_path	
@@ -236,11 +306,14 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return [], []
 
-        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
+        return self._ls_dir(fs_path)
+
+    def _ls_dir(self, fs_path):
+        cmd = "ls {}".format(fs_path)
         ret, lines = self._run_cmd(cmd)
 
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
         dirs = []
         files = []
@@ -249,9 +322,6 @@ class HDFSClient(FS):
             if len(arr) != 8:
                 continue
 
-            if fs_path not in arr[7]:
-                continue
-
             p = PurePosixPath(arr[7])
             if arr[0][0] == 'd':
                 dirs.append(p.name)
@@ -268,18 +338,20 @@ class HDFSClient(FS):
 
         return None
 
-    @_handle_errors
+    @_handle_errors()
     def is_dir(self, fs_path):
         if not self.is_exist(fs_path):
             return False
 
-        cmd = "{} -test -d {}".format(
-            self._base_cmd, fs_path, redirect_stderr=True)
+        return self._is_dir(fs_path)
+
+    def _is_dir(self, fs_path):
+        cmd = "test -d {}".format(fs_path, redirect_stderr=True)
         ret, lines = self._run_cmd(cmd)
         if ret:
             # other error
-            if self._test_match(lines) != None:
-                raise ExecuteError
+            if self._test_match(lines):
+                raise ExecuteError(cmd)
 
             return False
 
@@ -289,94 +361,155 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return False
 
-        return not self.is_dir(fs_path)
+        return not self._is_dir(fs_path)
 
-    @_handle_errors
+    @_handle_errors()
     def is_exist(self, fs_path):
-        cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
+        cmd = "ls {} ".format(fs_path)
         ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
             for l in out:
                 if "No such file or directory" in l:
                     return False
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
         return True
 
-    @_handle_errors
+    # can't retry
     def upload(self, local_path, fs_path):
         if self.is_exist(fs_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(fs_path))
 
         local = LocalFS()
         if not local.is_exist(local_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exists".format(local_path))
+
+        return self._try_upload(local_path, fs_path)
+
+    @_handle_errors()
+    def _try_upload(self, local_path, fs_path):
+        cmd = "put {} {}".format(local_path, fs_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            self.delete(fs_path)
+            raise e
+
+    # can't retry
     def download(self, fs_path, local_path):
         if self.is_exist(local_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(local_path))
 
         if not self.is_exist(fs_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exits".format(fs_path))
+
+        return self._try_download(fs_path, local_path)
+
+    @_handle_errors()
+    def _try_download(self, fs_path, local_path):
+        cmd = "get {} {}".format(fs_path, local_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            local_fs = LocalFS()
+            local_fs.delete(local_path)
+            raise e
+
+    @_handle_errors()
     def mkdirs(self, fs_path):
         if self.is_exist(fs_path):
             return
 
-        cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
-        ret, lines = self._run_cmd(cmd)
+        out_hdfs = False
+
+        cmd = "mkdir {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
-            raise ExecuteError
+            for l in out:
+                if "No such file or directory" in l:
+                    out_hdfs = True
+                    break
+            if not out_hdfs:
+                raise ExecuteError(cmd)
+
+        if out_hdfs and not self.is_exist(fs_path):
+            cmd = "mkdir -p {}".format(fs_path)
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        if overwrite and self.is_exist(fs_dst_path):
+            self.delete(fs_dst_path)
 
-    @_handle_errors
-    def mv(self, fs_src_path, fs_dst_path, test_exists=True):
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError
+                raise FSFileNotExistsError("{} is not exists".format(
+                    fs_src_path))
 
             if self.is_exist(fs_dst_path):
-                raise FSFileExistsError
+                raise FSFileExistsError("{} exists already".format(
+                    fs_src_path, fs_dst_path, fs_dst_path))
+
+        return self._try_mv(fs_src_path, fs_dst_path)
+
+    @_handle_errors()
+    def _try_mv(self, fs_src_path, fs_dst_path):
+        cmd = "mv {} {}".format(fs_src_path, fs_dst_path)
+        ret = 0
+        try:
+            ret, _ = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            if not self.is_exist(fs_src_path) and \
+                    self.is_exist(fs_dst_path):
+                return
+            raise e
 
-        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
-        ret, _ = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
     def _rmr(self, fs_path):
-        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        cmd = "rmr {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
-    @_handle_errors
     def _rm(self, fs_path):
-        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        cmd = "rm {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
+    @_handle_errors()
     def delete(self, fs_path):
         if not self.is_exist(fs_path):
             return
 
-        is_dir = self.is_dir(fs_path)
+        is_dir = self._is_dir(fs_path)
         if is_dir:
             return self._rmr(fs_path)
 
         return self._rm(fs_path)
 
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return self._touchz(fs_path)
+
+    @_handle_errors()
+    def _touchz(self, fs_path):
+        cmd = "touchz {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
     def need_upload_download(self):
         return True
diff --git a/python/paddle/fleet/utils/http_server.py b/python/paddle/distributed/fleet/utils/http_server.py
similarity index 100%
rename from python/paddle/fleet/utils/http_server.py
rename to python/paddle/distributed/fleet/utils/http_server.py
diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py
index fff10c5b2a9ee497cccff94346314db2c8011eb5..49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde 100644
--- a/python/paddle/distribution.py
+++ b/python/paddle/distribution.py
@@ -18,3 +18,540 @@
 #            'Normal',
 #            'sampling_id',
 #            'Uniform']
+
+from __future__ import print_function
+
+from .fluid.layers import control_flow
+from .fluid.layers import tensor
+from .fluid.layers import ops
+from .fluid.layers import nn
+from .fluid import core
+from .fluid.framework import in_dygraph_mode
+from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub
+import math
+import numpy as np
+import warnings
+
+from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+
+__all__ = ['Distribution', 'Uniform', 'Normal']
+
+
+class Distribution(object):
+    """
+    The abstract base class for probability distributions. Functions are 
+    implemented in specific distributions.
+    """
+
+    def __init__(self):
+        super(Distribution, self).__init__()
+
+    def sample(self):
+        """Sampling from the distribution."""
+        raise NotImplementedError
+
+    def entropy(self):
+        """The entropy of the distribution."""
+        raise NotImplementedError
+
+    def kl_divergence(self, other):
+        """The KL-divergence between self distributions and other."""
+        raise NotImplementedError
+
+    def log_prob(self, value):
+        """Log probability density/mass function."""
+        raise NotImplementedError
+
+    def probs(self, value):
+        """Probability density/mass function."""
+        raise NotImplementedError
+
+    def _validate_args(self, *args):
+        """
+        Argument validation for distribution args
+        Args:
+            value (float, list, numpy.ndarray, Tensor)
+        Raises
+            ValueError: if one argument is Tensor, all arguments should be Tensor
+        """
+        is_variable = False
+        is_number = False
+        for arg in args:
+            if isinstance(arg, tensor.Variable):
+                is_variable = True
+            else:
+                is_number = True
+
+        if is_variable and is_number:
+            raise ValueError(
+                'if one argument is Tensor, all arguments should be Tensor')
+
+        return is_variable
+
+    def _to_tensor(self, *args):
+        """
+        Argument convert args to Tensor
+
+        Args:
+            value (float, list, numpy.ndarray, Tensor)
+        Returns:
+            Tensor of args.
+        """
+        numpy_args = []
+        variable_args = []
+        tmp = 0.
+
+        for arg in args:
+            valid_arg = False
+            for cls in [float, list, np.ndarray, tensor.Variable]:
+                if isinstance(arg, cls):
+                    valid_arg = True
+                    break
+            assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor."
+            if isinstance(arg, float):
+                arg = np.zeros(1) + arg
+            arg_np = np.array(arg)
+            arg_dtype = arg_np.dtype
+            if str(arg_dtype) not in ['float32']:
+                warnings.warn(
+                    "data type of argument only support float32, your argument will be convert to float32."
+                )
+                arg_np = arg_np.astype('float32')
+            tmp = tmp + arg_np
+            numpy_args.append(arg_np)
+
+        dtype = tmp.dtype
+        for arg in numpy_args:
+            arg_broadcasted, _ = np.broadcast_arrays(arg, tmp)
+            arg_variable = tensor.create_tensor(dtype=dtype)
+            tensor.assign(arg_broadcasted, arg_variable)
+            variable_args.append(arg_variable)
+
+        return tuple(variable_args)
+
+
+class Uniform(Distribution):
+    """Uniform distribution with `low` and `high` parameters.
+
+    Mathematical Details
+
+    The probability density function (pdf) is
+
+    .. math::
+
+        pdf(x; a, b) = \\frac{1}{Z}, \ a <=x <b
+
+    .. math::
+
+        Z = b - a
+
+    In the above equation:
+
+    * :math:`low = a`,
+    * :math:`high = b`,
+    * :math:`Z`: is the normalizing constant.
+
+    The parameters `low` and `high` must be shaped in a way that supports
+    [broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation).
+
+    Args:
+        low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
+        high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
+        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+          from paddle.distribution import Uniform
+
+          paddle.disable_static()
+          # Without broadcasting, a single uniform distribution [3, 4]:
+          u1 = Uniform(low=3.0, high=4.0)
+          # 2 distributions [1, 3], [2, 4]
+          u2 = Uniform(low=[1.0, 2.0], high=[3.0, 4.0])
+          # 4 distributions
+          u3 = Uniform(low=[[1.0, 2.0], [3.0, 4.0]],
+                    high=[[1.5, 2.5], [3.5, 4.5]])
+
+          # With broadcasting:
+          u4 = Uniform(low=3.0, high=[5.0, 6.0, 7.0])
+
+          # Complete example
+          value_npdata = np.array([0.8], dtype="float32")
+          value_tensor = paddle.to_tensor(value_npdata)
+
+          uniform = Uniform([0.], [2.])
+
+          sample = uniform.sample([2])
+          # a random tensor created by uniform distribution with shape: [2, 1]
+          entropy = uniform.entropy()
+          # [0.6931472] with shape: [1]
+          lp = uniform.log_prob(value_tensor)
+          # [-0.6931472] with shape: [1]
+          p = uniform.probs(value_tensor)
+          # [0.5] with shape: [1]
+    """
+
+    def __init__(self, low, high, name=None):
+        if not in_dygraph_mode():
+            check_type(low, 'low',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Uniform')
+            check_type(high, 'high',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Uniform')
+
+        self.all_arg_is_float = False
+        self.batch_size_unknown = False
+        self.name = name if name is not None else 'Uniform'
+
+        if isinstance(low, int):
+            low = float(low)
+        if isinstance(high, int):
+            high = float(high)
+
+        if self._validate_args(low, high):
+            self.batch_size_unknown = True
+            self.low = low
+            self.high = high
+        else:
+            if isinstance(low, float) and isinstance(high, float):
+                self.all_arg_is_float = True
+            self.low, self.high = self._to_tensor(low, high)
+
+    def sample(self, shape, seed=0):
+        """Generate samples of the specified shape.
+
+        Args:
+          shape (list): 1D `int32`. Shape of the generated samples.
+          seed (int): Python integer number.
+
+        Returns:
+          Tensor: A tensor with prepended dimensions shape.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(shape, 'shape', (list), 'sample')
+            check_type(seed, 'seed', (int), 'sample')
+
+        name = self.name + '_sample'
+        batch_shape = list((self.low + self.high).shape)
+        if self.batch_size_unknown:
+            output_shape = shape + batch_shape
+            zero_tmp = tensor.fill_constant_batch_size_like(
+                self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
+            uniform_random_tmp = nn.uniform_random_batch_size_like(
+                zero_tmp,
+                zero_tmp.shape,
+                dtype=convert_dtype(zero_tmp.dtype),
+                min=0.,
+                max=1.,
+                seed=seed)
+            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
+            uniform_random_tmp_reshape = nn.reshape(uniform_random_tmp,
+                                                    output_shape)
+            output = uniform_random_tmp_reshape * (
+                zero_tmp_reshape + self.high - self.low)
+            output = elementwise_add(output, self.low, name=name)
+            return output
+        else:
+            output_shape = shape + batch_shape
+            output = nn.uniform_random(
+                output_shape, seed=seed) * (tensor.zeros(
+                    output_shape, dtype=self.low.dtype) +
+                                            (self.high - self.low))
+            output = elementwise_add(output, self.low, name=name)
+            if self.all_arg_is_float:
+                return nn.reshape(output, shape, name=name)
+            else:
+                return output
+
+    def log_prob(self, value):
+        """Log probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: log probability.The data type is same with value.
+
+        """
+        name = self.name + '_log_prob'
+        if in_dygraph_mode():
+            lb_bool = self.low < value
+            ub_bool = value < self.high
+
+            dtype = value.dtype
+            lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype',
+                               dtype)
+            ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype',
+                               dtype)
+            return nn.log(lb * ub) - nn.log(self.high - self.low)
+
+        check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                 'log_prob')
+
+        lb_bool = control_flow.less_than(self.low, value)
+        ub_bool = control_flow.less_than(value, self.high)
+        lb = tensor.cast(lb_bool, dtype=value.dtype)
+        ub = tensor.cast(ub_bool, dtype=value.dtype)
+        return elementwise_sub(
+            nn.log(lb * ub), nn.log(self.high - self.low), name=name)
+
+    def probs(self, value):
+        """Probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: probability.The data type is same with value.
+
+        """
+        name = self.name + '_probs'
+        if in_dygraph_mode():
+            lb_bool = self.low < value
+            ub_bool = value < self.high
+
+            dtype = value.dtype
+            lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype',
+                               dtype)
+            ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype',
+                               dtype)
+            return (lb * ub) / (self.high - self.low)
+
+        check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                 'log_prob')
+
+        lb_bool = control_flow.less_than(self.low, value)
+        ub_bool = control_flow.less_than(value, self.high)
+        lb = tensor.cast(lb_bool, dtype=value.dtype)
+        ub = tensor.cast(ub_bool, dtype=value.dtype)
+        return elementwise_div((lb * ub), (self.high - self.low), name=name)
+
+    def entropy(self):
+        """Shannon entropy in nats.
+
+        Returns:
+          Tensor: Shannon entropy of uniform distribution.The data type is float32.
+
+        """
+        name = self.name + '_entropy'
+        return nn.log(self.high - self.low, name=name)
+
+
+class Normal(Distribution):
+    """The Normal distribution with location `loc` and `scale` parameters.
+
+    Mathematical details
+
+    The probability density function (pdf) is
+
+    .. math::
+
+        pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2}  {\sigma^2} }
+
+    .. math::
+
+        Z = (2 \pi \sigma^2)^{0.5}
+
+    In the above equation:
+
+    * :math:`loc = \mu`: is the mean.
+    * :math:`scale = \sigma`: is the std.
+    * :math:`Z`: is the normalization constant.
+
+    Args:
+        loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor.
+        scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor.
+        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+        .. code-block:: python
+          
+          import numpy as np
+          import paddle
+          from paddle.distribution import Normal
+
+          paddle.disable_static()
+          # Define a single scalar Normal distribution.
+          dist = Normal(loc=0., scale=3.)
+          # Define a batch of two scalar valued Normals.
+          # The first has mean 1 and standard deviation 11, the second 2 and 22.
+          dist = Normal(loc=[1., 2.], scale=[11., 22.])
+          # Get 3 samples, returning a 3 x 2 tensor.
+          dist.sample([3])
+
+          # Define a batch of two scalar valued Normals.
+          # Both have mean 1, but different standard deviations.
+          dist = Normal(loc=1., scale=[11., 22.])
+
+          # Complete example
+          value_npdata = np.array([0.8], dtype="float32")
+          value_tensor = paddle.to_tensor(value_npdata)
+
+          normal_a = Normal([0.], [1.])
+          normal_b = Normal([0.5], [2.])
+          sample = normal_a.sample([2])
+          # a random tensor created by normal distribution with shape: [2, 1]
+          entropy = normal_a.entropy()
+          # [1.4189385] with shape: [1]
+          lp = normal_a.log_prob(value_tensor)
+          # [-1.2389386] with shape: [1]
+          p = normal_a.probs(value_tensor)
+          # [0.28969154] with shape: [1]
+          kl = normal_a.kl_divergence(normal_b)
+          # [0.34939718] with shape: [1]
+    """
+
+    def __init__(self, loc, scale, name=None):
+        if not in_dygraph_mode():
+            check_type(loc, 'loc',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Normal')
+            check_type(scale, 'scale',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Normal')
+
+        self.batch_size_unknown = False
+        self.all_arg_is_float = False
+        self.name = name if name is not None else 'Normal'
+
+        if isinstance(loc, int):
+            loc = float(loc)
+        if isinstance(scale, int):
+            scale = float(scale)
+
+        if self._validate_args(loc, scale):
+            self.batch_size_unknown = True
+            self.loc = loc
+            self.scale = scale
+        else:
+            if isinstance(loc, float) and isinstance(scale, float):
+                self.all_arg_is_float = True
+            self.loc, self.scale = self._to_tensor(loc, scale)
+
+    def sample(self, shape, seed=0):
+        """Generate samples of the specified shape.
+
+        Args:
+          shape (list): 1D `int32`. Shape of the generated samples.
+          seed (int): Python integer number.
+
+        Returns:
+          Tensor: A tensor with prepended dimensions shape.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(shape, 'shape', (list), 'sample')
+            check_type(seed, 'seed', (int), 'sample')
+
+        batch_shape = list((self.loc + self.scale).shape)
+        name = self.name + '_sample'
+
+        if self.batch_size_unknown:
+            output_shape = shape + batch_shape
+            zero_tmp = tensor.fill_constant_batch_size_like(
+                self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.)
+            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
+            zero_tmp_shape = nn.shape(zero_tmp_reshape)
+            normal_random_tmp = nn.gaussian_random(
+                zero_tmp_shape,
+                mean=0.,
+                std=1.,
+                seed=seed,
+                dtype=convert_dtype(self.loc.dtype))
+            output = normal_random_tmp * (zero_tmp_reshape + self.scale)
+            output = elementwise_add(output, self.loc, name=name)
+            return output
+        else:
+            output_shape = shape + batch_shape
+            output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \
+                     (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale)
+            output = elementwise_add(output, self.loc, name=name)
+            if self.all_arg_is_float:
+                return nn.reshape(output, shape, name=name)
+            else:
+                return output
+
+    def entropy(self):
+        """Shannon entropy in nats.
+
+        Returns:
+          Tensor: Shannon entropy of normal distribution.The data type is float32.
+
+        """
+        name = self.name + '_entropy'
+        batch_shape = list((self.loc + self.scale).shape)
+        zero_tmp = tensor.fill_constant_batch_size_like(
+            self.loc + self.scale, batch_shape, self.loc.dtype, 0.)
+        return elementwise_add(
+            0.5 + zero_tmp,
+            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
+            name=name)
+
+    def log_prob(self, value):
+        """Log probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: log probability.The data type is same with value.
+
+        """
+        if not in_dygraph_mode():
+            check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                     'log_prob')
+
+        name = self.name + '_log_prob'
+        var = self.scale * self.scale
+        log_scale = nn.log(self.scale)
+        return elementwise_sub(
+            -1. * ((value - self.loc) * (value - self.loc)) / (2. * var),
+            log_scale + math.log(math.sqrt(2. * math.pi)),
+            name=name)
+
+    def probs(self, value):
+        """Probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: probability.The data type is same with value.
+
+        """
+        if not in_dygraph_mode():
+            check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                     'log_prob')
+
+        name = self.name + '_probs'
+        var = self.scale * self.scale
+        return elementwise_div(
+            ops.exp(-1. * ((value - self.loc) * (value - self.loc)) /
+                    (2. * var)), (math.sqrt(2 * math.pi) * self.scale),
+            name=name)
+
+    def kl_divergence(self, other):
+        """The KL-divergence between two normal distributions.
+
+        Args:
+            other (Normal): instance of Normal.
+
+        Returns:
+            Tensor: kl-divergence between two normal distributions.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(other, 'other', Normal, 'kl_divergence')
+
+        name = self.name + '_kl_divergence'
+        var_ratio = self.scale / other.scale
+        var_ratio = (var_ratio * var_ratio)
+        t1 = (self.loc - other.loc) / other.scale
+        t1 = (t1 * t1)
+        return elementwise_add(
+            0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name)
diff --git a/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py
deleted file mode 100644
index 9fd919f30f688d1b12fac258c2d6c9dc47fbf049..0000000000000000000000000000000000000000
--- a/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-from paddle.fluid.optimizer import PipelineOptimizer as PO
-from .meta_optimizer_base import MetaOptimizerBase
-
-__all__ = ["PipelineOptimizer"]
-
-
-class PipelineOptimizer(MetaOptimizerBase):
-    def __init__(self, optimizer):
-        super(PipelineOptimizer, self).__init__(optimizer)
-        self.inner_opt = optimizer
-        # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
-
-    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
-                        user_defined_strategy):
-        super(PipelineOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
-        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
-
-    def _can_apply(self):
-        if self.user_defined_strategy.pipeline == True:
-            return True
-        return False
-
-    def _disable_strategy(self, dist_strategy):
-        dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {"micro_batch": 1}
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        return self.wrapped_opt.backward(loss, startup_program, parameter_list,
-                                         no_grad_set, callbacks)
-
-    def minimize_impl(self,
-                      loss,
-                      startup_program=None,
-                      parameter_list=None,
-                      no_grad_set=None):
-        optimize_ops, params_grads, prog_list = \
-            self.wrapped_opt.minimize(loss, startup_program,
-                                      parameter_list, no_grad_set)
-        return optimize_ops, params_grads
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 88dd815d937a4778b0d24a90d448a262689907f3..9f748b7956f9faa6b1c948d87f0ef4659057a421 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -68,7 +68,7 @@ from .input import embedding, one_hot
 from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
-from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
+from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
 from .incubate import fleet
 from .incubate import data_generator
 from .transpiler import DistributeTranspiler, \
@@ -89,6 +89,7 @@ from .dygraph.base import enable_dygraph, disable_dygraph
 from .io import save, load, load_program_state, set_program_state
 from .dygraph.checkpoint import save_dygraph, load_dygraph
 from .dygraph.varbase_patch_methods import monkey_patch_varbase
+from . import generator
 Tensor = LoDTensor
 enable_imperative = enable_dygraph
 disable_imperative = disable_dygraph
@@ -96,7 +97,7 @@ disable_imperative = disable_dygraph
 __all__ = framework.__all__ + executor.__all__ + \
     trainer_desc.__all__ + transpiler.__all__ + \
     parallel_executor.__all__ + lod_tensor.__all__ + \
-    data_feed_desc.__all__ + compiler.__all__ + backward.__all__  + [
+    data_feed_desc.__all__ + compiler.__all__ + backward.__all__  + generator.__all__ + [
         'io',
         'initializer',
         'embedding',
@@ -118,6 +119,7 @@ __all__ = framework.__all__ + executor.__all__ + \
         'LoDTensor',
         'LoDTensorArray',
         'CPUPlace',
+        'XPUPlace',
         'CUDAPlace',
         'CUDAPinnedPlace',
         'Tensor',
@@ -194,6 +196,7 @@ def __bootstrap__():
         'free_idle_chunk',
         'free_when_no_cache_hit',
         'call_stack_level',
+        'sort_sum_gradient',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 7b301ac19d1d3dc1f4aabb6cf3af2f0874faa677..5f6594a47213021c3a82dd4a0266f52240270e87 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -129,7 +129,7 @@ class GradientClipBase(object):
     def __str__(self):
         raise NotImplementedError()
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         raise NotImplementedError
 
@@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         for p, g in params_grads:
@@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         for p, g in params_grads:
@@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         sum_square_list = []
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index cece2ba4a3d788ab2df4c0a6a847c9597d36047a..e3755cbafea41e61352f67c3de040e700297b61a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -15,6 +15,7 @@
 import logging
 import numpy as np
 import sys
+import paddle
 from paddle.fluid import dygraph
 from paddle.fluid.dygraph.nn import Conv2D
 from paddle.fluid.dygraph.nn import Linear
@@ -195,13 +196,16 @@ class ImperativeQuantAware(object):
         with dygraph.guard():
             model.eval()
             input_vars = []
-            for shape, dtype in zip(input_shape, input_dtype):
-                raw_data = np.random.random(shape)
-                input_data = raw_data[np.newaxis, :].astype(
-                    dtype) if append_batch_size else raw_data.astype(dtype)
-                input_var = dygraph.to_variable(input_data)
-                input_vars.append(input_var)
-            outputs = prog_trans.get_output(model.forward, model, *input_vars)
+            for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)):
+                if append_batch_size:
+                    shape = [None] + list(shape)
+                # Note(Aurelius84): need a elegant way to name this.
+                in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i)
+                input_vars.append(in_spec)
+            # use `declarative` to convert dygraph into static program
+            model.forward = dygraph.jit.declarative(
+                model.forward, input_spec=input_vars)
+            outputs = model.forward.concrete_program.outputs
         input_spec = [input_vars[i] for i in feed]
         configs = dygraph.jit.SaveLoadConfig()
         configs.separate_params = True
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 3097e1d82a9cb5e096efa3913ea6a06bff557c94..244a621611060b87805846f1ea748615bcdde19a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -29,6 +29,7 @@ from .quantization_pass import _out_scale_op_list
 from .quantization_pass import _get_op_input_var_names
 from .quantization_pass import _get_op_output_var_names
 from .quantization_pass import _get_output_name_index
+from .quantization_pass import _channelwise_quant_axis1_ops
 
 __all__ = ['PostTrainingQuantization', 'WeightQuantization']
 
@@ -316,6 +317,7 @@ class PostTrainingQuantization(object):
         self._out_scale_op_list = _out_scale_op_list
         self._quantized_weight_var_name = set()
         self._quantized_act_var_name = set()
+        self.weight_op_pairs = {}
         self._sampling_data = {}
         self._quantized_var_kl_threshold = {}
         self._quantized_var_min = {}
@@ -436,6 +438,8 @@ class PostTrainingQuantization(object):
         graph = IrGraph(core.Graph(self._program.desc), for_test=True)
         graph = _remove_ctrl_vars(graph)
         graph = _apply_pass(self._scope, graph, 'conv_bn_fuse_pass')
+        graph = _apply_pass(self._scope, graph, 'depthwise_conv_bn_fuse_pass')
+        graph = _apply_pass(self._scope, graph, 'conv_transpose_bn_fuse_pass')
         self._program = graph.to_program()
 
     def _collect_target_varnames(self):
@@ -446,10 +450,11 @@ class PostTrainingQuantization(object):
         # TODO(juncaipeng), consider the name_scope of skip_quant
         _logger.info("Collect quantized variable names ...")
 
-        def collect_var_name(var_name_list, persistable_var_names):
+        def collect_var_name(var_name_list, persistable_var_names, op_type):
             for var_name in var_name_list:
                 if var_name in persistable_var_names:
                     self._quantized_weight_var_name.add(var_name)
+                    self.weight_op_pairs[var_name] = op_type
                 else:
                     self._quantized_act_var_name.add(var_name)
 
@@ -462,13 +467,15 @@ class PostTrainingQuantization(object):
             # For quantized ops, sample inputs and outputs
             if op_type in self._quantizable_op_type:
                 collect_var_name(
-                    _get_op_input_var_names(op), persistable_var_names)
+                    _get_op_input_var_names(op), persistable_var_names, op_type)
                 collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names)
+                    _get_op_output_var_names(op), persistable_var_names,
+                    op_type)
             # For other op, only sample output scale
             elif op_type in self._out_scale_op_list:
                 collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names)
+                    _get_op_output_var_names(op), persistable_var_names,
+                    op_type)
 
     def _set_activation_persistable(self):
         '''
@@ -492,45 +499,75 @@ class PostTrainingQuantization(object):
         Sample the input threshold(min, max, or abs_max) in every iterations.
         '''
         assert self._algo in ["abs_max", "min_max"], \
-            "The algo should be abs_max or min_max to sample min max value."
-
+            "The algo should be abs_max or min_max for _sample_threshold."
         if self._algo == "abs_max":
-            # Only calculate abs_max value for weight for once
-            if self._quantized_var_abs_max == {}:
-                for var_name in self._quantized_weight_var_name:
-                    var_tensor = _load_variable_data(self._scope, var_name)
-                    abs_max_per_channel = []
-                    for i in range(var_tensor.shape[0]):
-                        abs_max_per_channel.append(
-                            float(np.max(np.abs(var_tensor[i]))))
-                    self._quantized_var_abs_max[var_name] = abs_max_per_channel
-            for var_name in self._quantized_act_var_name:
-                var_tensor = _load_variable_data(self._scope, var_name)
-                abs_max_value = float(np.max(np.abs(var_tensor)))
-                if (var_name not in self._quantized_var_abs_max) or \
-                    (abs_max_value > self._quantized_var_abs_max[var_name]):
-                    self._quantized_var_abs_max[var_name] = abs_max_value
+            self._sample_threshold_abs_max()
         elif self._algo == "min_max":
-            if self._quantized_var_min == {} and self._quantized_var_max == {}:
-                for var_name in self._quantized_weight_var_name:
-                    var_tensor = _load_variable_data(self._scope, var_name)
-                    min_per_channel = []
-                    max_per_channle = []
-                    for i in range(var_tensor.shape[0]):
-                        min_per_channel.append(float(np.min(var_tensor[i])))
-                        max_per_channle.append(float(np.max(var_tensor[i])))
-                    self._quantized_var_min[var_name] = min_per_channel
-                    self._quantized_var_max[var_name] = max_per_channle
-            for var_name in self._quantized_act_var_name:
+            self._sample_threshold_min_max()
+
+    def _sample_threshold_abs_max(self):
+        assert self._algo == "abs_max", \
+            "The algo should be abs_max for _sample_threshold_abs_max."
+        # Only calculate abs_max value for weight for once
+        if self._quantized_var_abs_max == {}:
+            for var_name in self._quantized_weight_var_name:
+                var_tensor = _load_variable_data(self._scope, var_name)
+                if self._weight_quantize_type == "abs_max":
+                    abs_max_value = float(np.max(np.abs(var_tensor)))
+                elif self._weight_quantize_type == "channel_wise_abs_max":
+                    abs_max_value = []
+                    if self.weight_op_pairs[
+                            var_name] in _channelwise_quant_axis1_ops:
+                        for i in range(var_tensor.shape[1]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[:, i]))))
+                    else:
+                        for i in range(var_tensor.shape[0]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[i]))))
+                self._quantized_var_abs_max[var_name] = abs_max_value
+
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            abs_max_value = float(np.max(np.abs(var_tensor)))
+            if (var_name not in self._quantized_var_abs_max) or \
+                (abs_max_value > self._quantized_var_abs_max[var_name]):
+                self._quantized_var_abs_max[var_name] = abs_max_value
+
+    def _sample_threshold_min_max(self):
+        assert self._algo == "min_max", \
+            "The algo should be min_max for _sample_threshold_min_max."
+        if self._quantized_var_min == {} and self._quantized_var_max == {}:
+            for var_name in self._quantized_weight_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
-                min_value = float(np.min(var_tensor))
-                max_value = float(np.max(var_tensor))
-                if (var_name not in self._quantized_var_min) or \
-                    (min_value < self._quantized_var_min[var_name]):
-                    self._quantized_var_min[var_name] = min_value
-                if (var_name not in self._quantized_var_max) or \
-                    (max_value > self._quantized_var_max[var_name]):
-                    self._quantized_var_max[var_name] = max_value
+                if self._weight_quantize_type == "abs_max":
+                    min_value = float(np.min(var_tensor))
+                    max_value = float(np.max(var_tensor))
+                elif self._weight_quantize_type == "channel_wise_abs_max":
+                    min_value = []
+                    max_value = []
+                    if self.weight_op_pairs[
+                            var_name] in _channelwise_quant_axis1_ops:
+                        for i in range(var_tensor.shape[1]):
+                            min_value.append(float(np.min(var_tensor[:, i])))
+                            max_value.append(float(np.max(var_tensor[:, i])))
+                    else:
+                        for i in range(var_tensor.shape[0]):
+                            min_value.append(float(np.min(var_tensor[i])))
+                            max_value.append(float(np.max(var_tensor[i])))
+                self._quantized_var_min[var_name] = min_value
+                self._quantized_var_max[var_name] = max_value
+
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            min_value = float(np.min(var_tensor))
+            max_value = float(np.max(var_tensor))
+            if (var_name not in self._quantized_var_min) or \
+                (min_value < self._quantized_var_min[var_name]):
+                self._quantized_var_min[var_name] = min_value
+            if (var_name not in self._quantized_var_max) or \
+                (max_value > self._quantized_var_max[var_name]):
+                self._quantized_var_max[var_name] = max_value
 
     def _save_input_threhold(self):
         '''
@@ -554,11 +591,6 @@ class PostTrainingQuantization(object):
         applied in every iteration.
         '''
         assert self._algo == "KL", "The algo should be KL to sample data."
-        for var_name in self._quantized_weight_var_name:
-            if var_name not in self._sampling_data:
-                var_tensor = _load_variable_data(self._scope, var_name)
-                self._sampling_data[var_name] = var_tensor
-
         if self._is_use_cache_file:
             for var_name in self._quantized_act_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
@@ -584,15 +616,20 @@ class PostTrainingQuantization(object):
 
         # Abs_max threshold for weights
         for var_name in self._quantized_weight_var_name:
-            weight_data = self._sampling_data[var_name]
-            weight_threshold = None
+            weight_data = _load_variable_data(self._scope, var_name)
             if self._weight_quantize_type == "abs_max":
-                weight_threshold = np.max(np.abs(weight_data))
+                weight_threshold = float(np.max(np.abs(weight_data)))
             elif self._weight_quantize_type == "channel_wise_abs_max":
                 weight_threshold = []
-                for i in range(weight_data.shape[0]):
-                    abs_max_value = np.max(np.abs(weight_data[i]))
-                    weight_threshold.append(abs_max_value)
+                if self.weight_op_pairs[
+                        var_name] in _channelwise_quant_axis1_ops:
+                    for i in range(weight_data.shape[1]):
+                        weight_threshold.append(
+                            float(np.max(np.abs(weight_data[:, i]))))
+                else:
+                    for i in range(weight_data.shape[0]):
+                        weight_threshold.append(
+                            float(np.max(np.abs(weight_data[i]))))
             self._quantized_var_kl_threshold[var_name] = weight_threshold
 
         # KL threshold for activations
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 8851bcc6440d405f7484257b44760802feb0d8fb..b5a8d901943318ca039b0a73c1be39fb0734e212 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -66,6 +66,9 @@ _out_scale_op_list = [
     "concat",
     "elementwise_mul",
     "scale",
+    "hard_swish",
+    "hard_sigmoid",
+    "conv2d_transpose",
 ]
 
 # list op real input and output names, to avoid processing input such as AxisTensor.
@@ -109,8 +112,14 @@ _op_real_in_out_name = {
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],
+    "hard_swish": [["X"], ["Out"]],
+    "hard_sigmoid": [["X"], ["Out"]],
 }
 
+_conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
+
+_channelwise_quant_axis1_ops = ['conv2d_transpose', 'mul']
+
 
 def _get_op_input_var_names(op):
     """ """
@@ -185,10 +194,24 @@ def _is_input_all_not_persistable(graph, op_node):
     return is_input_all_not_persistable
 
 
+def _check_grandchild_op_node(op_node, grandchild_op_name):
+    '''
+    Check whether the fake_quant node has a grandchild op node named
+    grandchild_op_name.
+    '''
+    for out1_var_node in op_node.outputs:
+        for out1_op_node in out1_var_node.outputs:
+            for out2_var_node in out1_op_node.outputs:
+                for out2_op_node in out2_var_node.outputs:
+                    if out2_op_node.name() == grandchild_op_name:
+                        return True
+    return False
+
+
 class QuantizationTransformPass(object):
     """
-    Quantize the ops that have weights. Add quant and dequant ops for the quantized
-    ops's inputs.
+    Quantize the ops that have weights. Add quant and dequant ops for
+    the quantized ops's inputs.
     """
     _supported_quantizable_op_type = [
         'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul'
@@ -311,8 +334,8 @@ class QuantizationTransformPass(object):
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
-                % (str(weight_quantize_type)))
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' "
+                "or 'moving_average_abs_max'." % (str(weight_quantize_type)))
 
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
@@ -323,7 +346,6 @@ class QuantizationTransformPass(object):
         for op in self._quantizable_ops:
             assert op in QuantizationTransformPass._supported_quantizable_op_type, \
                 op + " is not supported for quantization."
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
@@ -356,10 +378,12 @@ class QuantizationTransformPass(object):
             user_skipped = False
             if isinstance(self._skip_pattern, list):
                 user_skipped = op_node.op().has_attr("op_namescope") and \
-                               any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern)
+                               any(pattern in op_node.op().attr("op_namescope") \
+                                   for pattern in self._skip_pattern)
             elif isinstance(self._skip_pattern, str):
                 user_skipped = op_node.op().has_attr("op_namescope") and \
-                               op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
+                               op_node.op().attr("op_namescope").find(
+                                   self._skip_pattern) != -1
 
             if user_skipped:
                 op_node.op()._set_attr("skip_quant", True)
@@ -373,15 +397,11 @@ class QuantizationTransformPass(object):
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
                 else:
-
                     name = var_node.name()
                     if name in processed_vars:
                         continue
-
-                    if var_node.name() in persistable_vars:
-                        is_weight = True
-                    else:
-                        is_weight = False
+                    is_weight = True if var_node.name() in persistable_vars \
+                        else False
 
                     # if var node is weight and weight_preprocess_func is not None,
                     # will insert weight preprocess func 
@@ -415,20 +435,14 @@ class QuantizationTransformPass(object):
                         else self._activation_bits
                     quant_type = self._weight_quantize_type if is_weight \
                         else self._activation_quantize_type
-                    if quant_type == 'channel_wise_abs_max':
-                        assert is_weight, "'channel_wise_abs_max' can only be applied on weights."
-                        if op.name() in self._conv_ops:
-                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
-                                graph, var_node, name, quant_bits)
-                            dequant_var_node = self._insert_channel_dequant_op(
-                                graph, quant_var_node, [scale_var_node],
-                                [quant_bits])
-                        else:
-                            quant_var_node, scale_var_node = self._insert_quant_op(
-                                graph, var_node, name, quant_bits, 'abs_max')
-                            dequant_var_node = self._insert_dequant_op(
-                                graph, quant_var_node, scale_var_node,
-                                quant_bits)
+                    if quant_type == 'channel_wise_abs_max':  # Weight quantization
+                        quant_axis = 1 if op.name() in \
+                            _channelwise_quant_axis1_ops else 0
+                        quant_var_node, scale_var_node = self._insert_channel_quant_op(
+                            graph, var_node, name, quant_bits, quant_axis)
+                        dequant_var_node = self._insert_channel_dequant_op(
+                            graph, quant_var_node, [scale_var_node],
+                            [quant_bits], quant_axis)
                     else:
                         quant_var_node, scale_var_node = self._insert_quant_op(
                             graph, var_node, name, quant_bits, quant_type)
@@ -529,11 +543,19 @@ class QuantizationTransformPass(object):
             var_type=var_node.type(),
             shape=var_node.shape(),
             var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
+        scale_var_node = graph.create_persistable_node(
             name=self._quantized_scale_name(name),
             var_type=var_node.type(),
             shape=[1],
             var_dtype=var_node.dtype())
+        data_type = 'float64' if var_node.dtype(
+        ) == core.VarDesc.VarType.FP64 else 'float32'
+        _init_var_node(
+            scale_var_node,
+            np.zeros(
+                scale_var_node.shape(), dtype=data_type),
+            self._scope,
+            self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
             attrs={
@@ -706,7 +728,8 @@ class QuantizationTransformPass(object):
 
         return quant_var_node, scale_out_node
 
-    def _insert_channel_quant_op(self, graph, var_node, name, quant_bits):
+    def _insert_channel_quant_op(self, graph, var_node, name, quant_bits,
+                                 quant_axis):
         """
         Insert fake_channel_wise_quantize_abs_max op in the graph.
         """
@@ -717,15 +740,24 @@ class QuantizationTransformPass(object):
             var_type=var_node.type(),
             shape=var_node.shape(),
             var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
+        scale_var_node = graph.create_persistable_node(
             name=self._quantized_scale_name(name),
             var_type=var_node.type(),
-            shape=[var_node.shape()[0]],
+            shape=[var_node.shape()[quant_axis]],
             var_dtype=var_node.dtype())
+        data_type = 'float64' if var_node.dtype(
+        ) == core.VarDesc.VarType.FP64 else 'float32'
+        _init_var_node(
+            scale_var_node,
+            np.zeros(
+                scale_var_node.shape(), dtype=data_type),
+            self._scope,
+            self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_channel_wise_quantize_abs_max',
             attrs={
                 'bit_length': quant_bits,
+                'quant_axis': quant_axis,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node},
@@ -763,7 +795,7 @@ class QuantizationTransformPass(object):
         return dequant_var_node
 
     def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
-                                   quant_bits):
+                                   quant_bits, quant_axis):
         """
         Insert fake_channel_wise_dequantize_max_abs in the graph.
         """
@@ -778,6 +810,7 @@ class QuantizationTransformPass(object):
             op_type='fake_channel_wise_dequantize_max_abs',
             attrs={
                 'quant_bits': quant_bits,
+                'quant_axis': quant_axis,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node,
@@ -1036,7 +1069,6 @@ class QuantizationFreezePass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
-        self._conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
         self._fake_quant_op_names = _fake_quant_op_list
         self._fake_dequant_op_names = _fake_dequant_op_list
         self._op_input_rename_map = collections.OrderedDict()
@@ -1063,34 +1095,37 @@ class QuantizationFreezePass(object):
                     if input_arg_name in graph.out_node_mapping_table.keys():
                         input_arg_name = graph.out_node_mapping_table[
                             input_arg_name]
-                if input_arg_name in persistable_vars:
-                    if self._weight_quantize_type == 'abs_max':
-                        param = self._load_var(input_arg_name)
-                        scale_v = np.max(np.abs(param))
-                    elif self._weight_quantize_type == 'channel_wise_abs_max':
-                        param = self._load_var(input_arg_name)
-                        if len(param.shape) == 4:  # conv2d or depthwise_conv2d
-                            scale_v = []
-                            for i in range(param.shape[0]):
-                                scale_v.append(np.max(np.abs(param[i])))
-                        else:
-                            scale_v = np.max(np.abs(param))
+                if input_arg_name not in persistable_vars:
+                    scale_v = graph._find_node_by_name(
+                        op_node.outputs, op_node.output('OutScale')[0])
+                    self._quant_var_scale_map[input_arg_name] = scale_v
+                else:
+                    # Obtain scale from OutScale var node
+                    scale_v = self._load_var(op_node.output('OutScale')[0])
+                    assert scale_v.ndim in [
+                        1, 2
+                    ], "the dim of scale_v should be 1 or 2"
+                    if scale_v.ndim == 2:
+                        scale_v = scale_v[0]
+                    if scale_v.size == 1:
+                        scale_v = scale_v[0]
                     else:
-                        scale_v = self._load_var(
-                            op_node.output('OutScale')[0])[0]
+                        scale_v = scale_v.tolist()
                     self._quant_var_scale_map[input_arg_name] = scale_v
-                    self._remove_fake_quant_and_dequant_op(graph, op_node)
-                    # quantize weight and restore
+                    # Quantize weight and restore
                     param_v = self._load_var(input_arg_name)
-                    quantized_param_v = self._quant(param_v, scale_v,
-                                                    self._weight_bits)
+                    if isinstance(scale_v, list) and \
+                        any(_check_grandchild_op_node(op_node, op)
+                        for op in _channelwise_quant_axis1_ops):
+                        quant_axis = 1
+                    else:
+                        quant_axis = 0
+                    quantized_param_v = self._quant(
+                        param_v, scale_v, self._weight_bits, quant_axis)
                     self._restore_var(input_arg_name, quantized_param_v)
-                else:
-                    scale_v = graph._find_node_by_name(
-                        op_node.outputs, op_node.output('OutScale')[0])
-                    self._quant_var_scale_map[input_arg_name] = scale_v
+                    self._remove_fake_quant_and_dequant_op(graph, op_node)
 
-        # Remove all fake dequant op
+# Remove all fake dequant op
         ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
@@ -1103,8 +1138,7 @@ class QuantizationFreezePass(object):
             op_node_desc = op_node.op()
             if op_node_desc.has_attr("quantization_type") and \
                 op_node_desc.attr("quantization_type") == "qat_with_weight":
-                if self._weight_quantize_type == 'channel_wise_abs_max' \
-                    and op_node.name() in self._conv_ops:
+                if self._weight_quantize_type == 'channel_wise_abs_max':
                     self._insert_post_channel_dequant_op(graph, op_node)
                 else:
                     self._insert_post_dequant_op(graph, op_node)
@@ -1295,10 +1329,15 @@ class QuantizationFreezePass(object):
         return isinstance(v, float) or isinstance(v, np.float32) \
             or isinstance(v, np.float64)
 
-    def _quant(self, x, scale, num_bits):
+    def _quant(self, x, scale, num_bits, quant_axis):
+        assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.'
         if isinstance(scale, list):
             for i, s in enumerate(scale):
-                x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+                if quant_axis == 0:
+                    x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+                else:
+                    x[:, i] = np.round(x[:, i] / s * (
+                        (1 << (num_bits - 1)) - 1))
             return x
         else:
             return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
@@ -1468,6 +1507,10 @@ class OutScaleForTrainingPass(object):
         for op in target_ops:
             for output_var_name in _get_op_output_var_names(op):
                 in_node = graph._find_node_by_name(op.outputs, output_var_name)
+                if in_node.dtype() not in \
+                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                    continue
+
                 scale_node = graph.create_persistable_node(
                     name=self._scale_name(in_node.name()),
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -1570,17 +1613,26 @@ class OutScaleForInferencePass(object):
             if op_node.name() in self._teller_set:
                 var_names = _get_op_output_var_names(op_node)
                 for var_name in var_names:
-                    # For compatibility, we save output threshold by two methods.
+                    in_node = graph._find_node_by_name(op_node.outputs,
+                                                       var_name)
+                    if in_node.dtype() not in \
+                        [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                        continue
+
                     scale_name = self._scale_name(var_name)
-                    scale_v = np.array(
-                        self._scope.find_var(scale_name).get_tensor())[0]
-                    op_node.op()._set_attr("out_threshold", float(scale_v))
+                    scale_var = self._scope.find_var(scale_name)
+                    assert scale_var is not None, \
+                        "Can not find {} variable in the scope".format(scale_name)
+                    scale_value = np.array(scale_var.get_tensor())[0]
+
+                    # For compatibility, we save output threshold by two methods.
+                    op_node.op()._set_attr("out_threshold", float(scale_value))
 
                     argname_index = _get_output_name_index(op_node, var_name)
                     assert argname_index is not None, \
                         var_name + " is not the output of the op"
                     op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
-                        + "_threshold", float(scale_v))
+                        + "_threshold", float(scale_value))
         graph.resolve_hazard()
         return graph
 
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index df7e585d45f445067b3a700951418c06c9062ae7..007d701284dfc7ff2cafb128984414517579fce3 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -123,6 +123,7 @@ endfunction()
 
 if(WIN32)
 	list(REMOVE_ITEM TEST_OPS test_light_nas)
+	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
 	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
     list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
@@ -263,6 +264,13 @@ list(REMOVE_ITEM TEST_OPS
 #TODO(wanghaoshuang): Fix this unitest failed on GCC8.
 LIST(REMOVE_ITEM TEST_OPS test_auto_pruning)
 LIST(REMOVE_ITEM TEST_OPS test_filter_pruning)
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+
+# setting timeout value for old unittests
+if(NOT WIN32)
+    set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY")
+	  set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY")
+endif()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac1590b8aa6eaefbccd3907b314fb438386ffc6
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
@@ -0,0 +1,226 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+import unittest
+import os
+import time
+import sys
+import random
+import math
+import functools
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.dataset.common import download
+from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
+
+random.seed(0)
+np.random.seed(0)
+
+
+class TestPostTrainingQuantization(unittest.TestCase):
+    def setUp(self):
+        self.download_path = 'int8/download'
+        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
+                                               self.download_path)
+        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        self.int8_model_path = os.path.join(os.getcwd(),
+                                            "post_training_" + self.timestamp)
+        try:
+            os.system("mkdir -p " + self.int8_model_path)
+        except Exception as e:
+            print("Failed to create {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+            sys.exit(-1)
+
+    def tearDown(self):
+        try:
+            os.system("rm -rf {}".format(self.int8_model_path))
+        except Exception as e:
+            print("Failed to delete {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+
+    def cache_unzipping(self, target_folder, zip_path):
+        if not os.path.exists(target_folder):
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
+                                                          zip_path)
+            os.system(cmd)
+
+    def download_model(self, data_url, data_md5, folder_name):
+        download(data_url, self.download_path, data_md5)
+        file_name = data_url.split('/')[-1]
+        zip_path = os.path.join(self.cache_folder, file_name)
+        print('Data is downloaded at {0}'.format(zip_path))
+
+        data_cache_folder = os.path.join(self.cache_folder, folder_name)
+        self.cache_unzipping(data_cache_folder, zip_path)
+        return data_cache_folder
+
+    def run_program(self, model_path, batch_size, infer_iterations):
+        print("test model path:" + model_path)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        [infer_program, feed_dict, fetch_targets] = \
+            fluid.io.load_inference_model(model_path, exe)
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size)
+
+        img_shape = [1, 28, 28]
+        test_info = []
+        cnt = 0
+        periods = []
+        for batch_id, data in enumerate(val_reader()):
+            image = np.array(
+                [x[0].reshape(img_shape) for x in data]).astype("float32")
+            input_label = np.array([x[1] for x in data]).astype("int64")
+
+            t1 = time.time()
+            out = exe.run(infer_program,
+                          feed={feed_dict[0]: image},
+                          fetch_list=fetch_targets)
+            t2 = time.time()
+            period = t2 - t1
+            periods.append(period)
+
+            out_label = np.argmax(np.array(out[0]), axis=1)
+            top1_num = sum(input_label == out_label)
+            test_info.append(top1_num)
+            cnt += len(data)
+
+            if (batch_id + 1) == infer_iterations:
+                break
+
+        throughput = cnt / np.sum(periods)
+        latency = np.average(periods)
+        acc1 = np.sum(test_info) / cnt
+        return (throughput, latency, acc1)
+
+    def generate_quantized_model(self,
+                                 model_path,
+                                 algo="KL",
+                                 quantizable_op_type=["conv2d"],
+                                 is_full_quantize=False,
+                                 is_use_cache_file=False,
+                                 is_optimize_model=False,
+                                 batch_size=10,
+                                 batch_nums=10):
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        scope = fluid.global_scope()
+        val_reader = paddle.dataset.mnist.train()
+
+        ptq = PostTrainingQuantization(
+            executor=exe,
+            model_dir=model_path,
+            sample_generator=val_reader,
+            batch_size=batch_size,
+            batch_nums=batch_nums,
+            algo=algo,
+            quantizable_op_type=quantizable_op_type,
+            is_full_quantize=is_full_quantize,
+            optimize_model=is_optimize_model,
+            is_use_cache_file=is_use_cache_file)
+        ptq.quantize()
+        ptq.save_quantized_model(self.int8_model_path)
+
+    def run_test(self,
+                 model_name,
+                 data_url,
+                 data_md5,
+                 algo,
+                 quantizable_op_type,
+                 is_full_quantize,
+                 is_use_cache_file,
+                 is_optimize_model,
+                 diff_threshold,
+                 batch_size=10,
+                 infer_iterations=10,
+                 quant_iterations=5):
+
+        origin_model_path = self.download_model(data_url, data_md5, model_name)
+        origin_model_path = os.path.join(origin_model_path, model_name)
+
+        print("Start FP32 inference for {0} on {1} images ...".format(
+            model_name, infer_iterations * batch_size))
+        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
+            origin_model_path, batch_size, infer_iterations)
+
+        print("Start INT8 post training quantization for {0} on {1} images ...".
+              format(model_name, quant_iterations * batch_size))
+        self.generate_quantized_model(
+            origin_model_path, algo, quantizable_op_type, is_full_quantize,
+            is_use_cache_file, is_optimize_model, batch_size, quant_iterations)
+
+        print("Start INT8 inference for {0} on {1} images ...".format(
+            model_name, infer_iterations * batch_size))
+        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
+            self.int8_model_path, batch_size, infer_iterations)
+
+        print("---Post training quantization of {} method---".format(algo))
+        print(
+            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
+            format(model_name, batch_size, fp32_throughput, fp32_latency,
+                   fp32_acc1))
+        print(
+            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
+            format(model_name, batch_size, int8_throughput, int8_latency,
+                   int8_acc1))
+        sys.stdout.flush()
+
+        delta_value = fp32_acc1 - int8_acc1
+        self.assertLess(delta_value, diff_threshold)
+
+
+class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
+    def test_post_training_kl(self):
+        model_name = "mnist_model"
+        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
+        data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
+        algo = "KL"
+        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
+        is_full_quantize = False
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.01
+        batch_size = 10
+        infer_iterations = 50
+        quant_iterations = 5
+        self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold, batch_size, infer_iterations,
+                      quant_iterations)
+
+
+class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization):
+    def test_post_training_abs_max(self):
+        model_name = "mnist_model"
+        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
+        data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
+        algo = "abs_max"
+        quantizable_op_type = ["conv2d", "mul"]
+        is_full_quantize = True
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.01
+        batch_size = 10
+        infer_iterations = 50
+        quant_iterations = 10
+        self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold, batch_size, infer_iterations,
+                      quant_iterations)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
index c9ea15bf6cde9af16810920f53a7d5e045a852e3..32292c8a47b50bc5e7eb2d7833823e586eea8909 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
@@ -33,34 +33,29 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
 
-def residual_block(img, label, num=1):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            use_cudnn=False,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    hidden = img
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 20, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 20, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        pool_type='max',
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        pool_type='avg',
+        act="relu")
+    hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
 
 
 def pact(x, name=None):
@@ -102,7 +97,7 @@ class TestUserDefinedQuantization(unittest.TestCase):
                     img.stop_gradient = False
                     label = fluid.layers.data(
                         name='label', shape=[1], dtype='int64')
-                    loss = residual_block(img, label, 1)
+                    loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.SGD(learning_rate=0.0001)
                         opt.minimize(loss)
diff --git a/python/paddle/fluid/contrib/tests/test_distributed_reader.py b/python/paddle/fluid/contrib/tests/test_distributed_reader.py
index 51e1455e71ecfe3f347977bea17a56e556c5ce0d..b964168eb3a2f14fa6dd55d189592daa6ec93d3c 100644
--- a/python/paddle/fluid/contrib/tests/test_distributed_reader.py
+++ b/python/paddle/fluid/contrib/tests/test_distributed_reader.py
@@ -36,8 +36,9 @@ class TestDistributedReader(unittest.TestCase):
         data = next(reader())
         assert data == 1
 
-        os.unsetenv('PADDLE_TRAINER_ID')
-        os.unsetenv('PADDLE_TRAINERS_NUM')
+        #Note: windows python3 don't have unsetenv
+        del os.environ['PADDLE_TRAINER_ID']
+        del os.environ['PADDLE_TRAINERS_NUM']
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index d87363abf14cdfc3e29567bd41dbac387b882f76..a05aa3b0a84b57bb1f9ce00b0ad007280c316c6e 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -201,10 +201,13 @@ def pre_load(dso_name):
 
 
 def get_glibc_ver():
-    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'").strip()
+    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'")
 
 
 def less_than_ver(a, b):
+    if a is None or b is None:
+        return False
+
     import re
     import operator
 
diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py
index 2c75c493cba02dc21a5e2518a8a5e52b6eb4fd81..dc57e9f71ed3d0de1a374bdf719b32a083198b31 100644
--- a/python/paddle/fluid/data.py
+++ b/python/paddle/fluid/data.py
@@ -18,17 +18,14 @@ import six
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_dtype, check_type
+from ..utils import deprecated
 
 __all__ = ['data']
 
 
+@deprecated(since="2.0.0", update_to="paddle.static.data")
 def data(name, shape, dtype='float32', lod_level=0):
     """
-    :api_attr: Static Graph
-	:alias_main: paddle.nn.data
-	:alias: paddle.nn.data,paddle.nn.input.data
-	:old_api: paddle.fluid.data
-
     **Data Layer**
 
     This function creates a variable on the global block. The global variable
@@ -52,7 +49,7 @@ def data(name, shape, dtype='float32', lod_level=0):
 
         The default :code:`stop_gradient` attribute of the Variable created by
         this API is true, which means the gradient won't be passed backward
-        through the data Varaible. Set :code:`var.stop_gradient = False` If
+        through the data Variable. Set :code:`var.stop_gradient = False` If
         user would like to pass backward gradient.
 
     Args:
@@ -88,7 +85,7 @@ def data(name, shape, dtype='float32', lod_level=0):
 
           z = x + y
 
-          # In this example, we will feed x and y with np-ndarry "1"
+          # In this example, we will feed x and y with np-ndarray "1"
           # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
           feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
 
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index e8d708e04ce54bf6589ada0a55de13f06f0ba2a9..45aa85d4168a55e206460ce2e39292013caa9ce0 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -50,14 +50,15 @@ def convert_dtype(dtype):
     elif isinstance(dtype, type):
         if dtype in [
                 np.bool, np.float16, np.float32, np.float64, np.int8, np.int16,
-                np.int32, np.int64, np.uint8
+                np.int32, np.int64, np.uint8, np.complex64, np.complex128
         ]:
             return dtype.__name__
     else:
         if dtype in [
                 'bool', 'float16', 'float32', 'float64', 'int8', 'int16',
-                'int32', 'int64', 'uint8', u'bool', u'float16', u'float32',
-                u'float64', u'int8', u'int16', u'int32', u'int64', u'uint8'
+                'int32', 'int64', 'uint8', 'complex64', 'complex128', u'bool',
+                u'float16', u'float32', u'float64', u'int8', u'int16', u'int32',
+                u'int64', u'uint8', u'complex64', u'complex128'
         ]:
             # this code is a little bit dangerous, since error could happen
             # when casting no-ascii code to str in python2.
@@ -68,7 +69,7 @@ def convert_dtype(dtype):
 
     raise TypeError(
         "dtype must be any of [bool, float16, float32, float64, int8, int16, "
-        "int32, int64, uint8], but received %s" % dtype)
+        "int32, int64, uint8, complex64, complex128], but received %s" % dtype)
 
 
 def check_variable_and_dtype(input,
diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/fluid/dataloader/__init__.py
index 62aefd6aec8cb92faf4595a821381f35c2abd5bd..597f1f217483ccafe73d0b4fe337cb2b24b4b436 100644
--- a/python/paddle/fluid/dataloader/__init__.py
+++ b/python/paddle/fluid/dataloader/__init__.py
@@ -20,5 +20,13 @@ from .dataset import *
 from . import batch_sampler
 from .batch_sampler import *
 
+from . import dataloader_iter
+from .dataloader_iter import *
+
+from . import sampler
+from .sampler import *
+
 __all__ = dataset.__all__ \
-        + batch_sampler.__all__
+        + batch_sampler.__all__ \
+        + dataloader_iter.__all__ \
+        + sampler.__all__
diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py
index c6163f7da1ee6de139b7503e8e6a6c3722ea8a7b..1d180329b72510de5e7e9362e4c002f4508ba1be 100644
--- a/python/paddle/fluid/dataloader/batch_sampler.py
+++ b/python/paddle/fluid/dataloader/batch_sampler.py
@@ -16,12 +16,13 @@ from __future__ import print_function
 from __future__ import division
 
 import numpy as np
-from .dataset import Dataset
+from .sampler import Sampler, SequenceSampler, RandomSampler
+from .dataset import Dataset, IterableDataset
 
 __all__ = ["BatchSampler"]
 
 
-class BatchSampler(object):
+class BatchSampler(Sampler):
     """
     A base implement of batch sampler used by `paddle.io.DataLoader`
     which yield mini-batch indices(a list/tuple with length as
@@ -41,10 +42,11 @@ class BatchSampler(object):
                 implement or other python object which implemented
                 :code:`__len__` for BatchSampler to get indices as the
                 range of :attr:`dataset` length. Default None.
-        indices (list|tuple): a substitution parameter for
-                :attr:`dataset` either :attr:`dataset` or
-                :attr:`indices` should be set, give the whole
-                indices to sampler from directly. Default None.
+        sampler (Sampler): this could be a :code:`paddle.io.Dataset`
+                instance which implemented :code:`__iter__` to yield
+                sample indices. :attr:`sampler` and :attr:`dataset`
+                can not be set in the same time.  If :attr:`sampler`
+                is set, :attr:`shuffle` should not be set. Default None.
         shuffle(bool): whether to shuffle indices order before genrating
                 batch indices. Default False.
         batch_size(int): sample indice number in a mini-batch indices.
@@ -58,16 +60,7 @@ class BatchSampler(object):
         
         .. code-block:: python
             
-            from paddle.io import BatchSampler, Dataset
-
-            # init with indices
-            bs = BatchSampler(indices=list(range(100)),
-                              shuffle=True,
-                              batch_size=8,
-                              drop_last=True)
-
-            for batch_indices in bs:
-                print(batch_indices)
+            from paddle.io import RandomSampler, BatchSampler, Dataset
 
             # init with dataset
             class RandomDataset(Dataset):
@@ -90,46 +83,57 @@ class BatchSampler(object):
             for batch_indices in bs:
                 print(batch_indices)
 
+            # init with sampler
+            sampler = RandomSampler(RandomDataset(100))
+            bs = BatchSampler(sampler=sampler,
+                              batch_size=8,
+                              drop_last=True)
+
+            for batch_indices in bs:
+                print(batch_indices)
+
+
     see `paddle.io.DataLoader`
 
     """
 
     def __init__(self,
                  dataset=None,
-                 indices=None,
+                 sampler=None,
                  shuffle=False,
                  batch_size=1,
                  drop_last=False):
         if dataset is None:
-            assert indices is not None, \
-                "either dataset or indices should be set"
-            assert isinstance(indices, list) or isinstance(indices, tuple), \
-                "indices should be a list or tuple, but got {}".format(type(indices))
-            self.indices = indices
+            assert sampler is not None, \
+                "either dataset or sampler should be set"
+            assert isinstance(sampler, Sampler), \
+                "sampler should be a paddle.io.Sampler, but got {}".format(type(sampler))
+            assert not shuffle, "shuffle should be False when sampler is set"
+            self.sampler = sampler
         else:
             assert isinstance(dataset, Dataset), \
-                "dataset should be an instance of paddle.io.Dataset"
-            assert indices is None, \
-                "should not set both dataset and indices"
-            self.indices = list(range(len(dataset)))
+                "dataset should be a paddle.io.Dataset"
+            assert not isinstance(dataset, IterableDataset), \
+                "dataset should not be a paddle.io.IterableDataset"
+            assert sampler is None, \
+                "should not set both dataset and sampler"
+            assert isinstance(shuffle, bool), \
+                "shuffle should be a boolean value, but got {}".format(type(shuffle))
+            if shuffle:
+                self.sampler = RandomSampler(dataset)
+            else:
+                self.sampler = SequenceSampler(dataset)
 
         assert isinstance(batch_size, int) and batch_size > 0, \
             "batch_size should be a positive integer, but got {}".format(batch_size)
         self.batch_size = batch_size
-        assert isinstance(shuffle, bool), \
-            "shuffle should be a boolean value, but got {}".format(type(shuffle))
-        self.shuffle = shuffle
         assert isinstance(drop_last, bool), \
             "drop_last should be a boolean value, but got {}".format(type(drop_last))
         self.drop_last = drop_last
 
     def __iter__(self):
-        if self.shuffle:
-            np.random.shuffle(self.indices)
-        _iter = iter(self.indices)
-
         batch_indices = []
-        for idx in _iter:
+        for idx in self.sampler:
             batch_indices.append(idx)
             if len(batch_indices) == self.batch_size:
                 yield batch_indices
@@ -138,6 +142,19 @@ class BatchSampler(object):
             yield batch_indices
 
     def __len__(self):
-        num_samples = len(self.indices)
+        num_samples = len(self.sampler)
         num_samples += int(not self.drop_last) * (self.batch_size - 1)
         return num_samples // self.batch_size
+
+
+class _InfiniteIterableSampler(object):
+    def __init__(self, dataset, batch_size=1):
+        assert isinstance(
+            dataset, IterableDataset
+        ), "dataset should be an instance of paddle.io.IterableDataset"
+        self.dataset = dataset
+        self.batch_size = batch_size
+
+    def __iter__(self):
+        while True:
+            yield [None] * self.batch_size
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 214cd772af6b1fa6e24ec972c0f0644dc1c09f95..6a996493e4df1e1facc6ccd205a8ae5105f92c5b 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -22,6 +22,7 @@ import itertools
 import threading
 import numpy as np
 import multiprocessing
+from collections import namedtuple
 
 # NOTE: queue has a different name in python2 and python3
 if six.PY2:
@@ -29,14 +30,21 @@ if six.PY2:
 else:
     import queue
 
-from .. import core
+import paddle
+from .. import core, layers
 from ..framework import in_dygraph_mode
 from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler
+from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
+
+__all__ = ['get_worker_info']
 
 # multi-process worker check indices queue interval, avoid
 # hanging in subprocess data loading
 MP_INDICES_CHECK_INTERVAL = 5
 
+_IterableDatasetStopIteration = namedtuple('_IterableDatasetStopIteration',
+                                           ['worker_id'])
+
 
 def default_collate_fn(batch):
     """
@@ -72,7 +80,27 @@ def default_collate_fn(batch):
                 slots.append([item])
             else:
                 slots[i].append(item)
-    return [np.stack(slot, axis=0) for slot in slots]
+
+    if isinstance(slots[0][0], np.ndarray):
+        return [np.stack(slot, axis=0) for slot in slots]
+    elif isinstance(slots[0][0], paddle.Tensor):
+        return [layers.stack(slot, axis=0) for slot in slots]
+    else:
+        raise RuntimeError("Unknown data type {}".format(type(slots[0][0])))
+
+
+class _DatasetKind(object):
+    MAP = 0
+    ITER = 1
+
+    @staticmethod
+    def create_fetcher(kind, dataset, collate_fn, drop_last):
+        if kind == _DatasetKind.MAP:
+            return _MapDatasetFetcher(dataset, collate_fn, drop_last)
+        elif kind == _DatasetKind.ITER:
+            return _IterableDatasetFetcher(dataset, collate_fn, drop_last)
+        else:
+            raise NotImplementedError("unknown Dataset kind {}".format(kind))
 
 
 class ParentWatchDog(object):
@@ -86,6 +114,92 @@ class ParentWatchDog(object):
         return self._parent_alive
 
 
+# worker information for each workers, used for splitting data copy
+# for IteratorDataset in worker processes.
+_worker_info = None
+
+
+def get_worker_info():
+    """
+    Get DataLoader worker process information function, this function is
+    used to split data copy in worker process for IterableDataset
+    (see :code:`paddle.io.IterableDataset`), worker information contains
+    following fields:
+
+    :attr:`num_workers`: total worker process number, see `paddle.io.DataLoader`
+
+    :attr:`id`: the worker processs id, count from 0 to :attr:`num_workers - 1`
+
+    :attr:`dataset`: the dataset object in this worker process
+
+    Returns:
+        WorkerInfo: an instance of WorkerInfo which contains fields above.
+
+    .. note::
+        For mode usage and exampls, please see :code:`paddle.io.IterableDataset`
+
+    Example:
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class SplitedIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    worker_info = get_worker_info()
+                    if worker_info is None:
+                        iter_start = self.start
+                        iter_end = self.end
+                    else:
+                        per_worker = int(
+                            math.ceil((self.end - self.start) / float(
+                                worker_info.num_workers)))
+                        worker_id = worker_info.id
+                        iter_start = self.start + worker_id * per_worker
+                        iter_end = min(iter_start + per_worker, self.end)
+
+                    for i in range(iter_start, iter_end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = SplitedIterableDataset(start=2, end=9)
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    """
+    return _worker_info
+
+
+class WorkerInfo(object):
+    __initialized = False
+
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self.__initialized = True
+
+    def __setattr__(self, key, val):
+        if self.__initialized:
+            raise RuntimeError("Cannot assign attributes to {} objects".format(
+                self.__class__.__name__))
+        return super(WorkerInfo, self).__setattr__(key, val)
+
+
 class _DataLoaderIterBase(object):
     """
     Iterator implement of DataLoader, will load and feed mini-batch
@@ -108,6 +222,8 @@ class _DataLoaderIterBase(object):
         self._use_shared_memory = loader.use_shared_memory
         self._timeout = loader.timeout if loader.timeout > 0 else MP_INDICES_CHECK_INTERVAL
         self._worker_init_fn = loader.worker_init_fn
+        self._dataset_kind = loader.dataset_kind
+        self._pin_memory = loader.pin_memory
 
         # LoDTensorBlockingQueue instance for create_py_reader and a thread
         # to put mini-batch data to self._blocking_queue, mini-batch data
@@ -134,6 +250,9 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
     def __init__(self, loader):
         super(_DataLoaderIterSingleProcess, self).__init__(loader)
 
+        self._dataset_fetcher = _DatasetKind.create_fetcher(
+            self._dataset_kind, self._dataset, self._collate_fn, True)
+
         # NOTE: len(self._places) batch data compose as an output
         # iteration, set blocking_queue can cache 2 iteration datas
         # at most here
@@ -154,7 +273,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
             len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_buffer_reader, True)
+            self._need_check_feed, self._places, self._use_buffer_reader, True,
+            self._pin_memory)
 
         self._thread = threading.Thread(target=self._thread_loop)
         self._thread.daemon = True
@@ -164,15 +284,19 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
         try:
             for indices in self._sampler_iter:
                 # read data from dataset in mini-batch
-                batch = [self._dataset[i] for i in indices]
-                if self._collate_fn is not None:
-                    batch = self._collate_fn(batch)
+                batch = self._dataset_fetcher.fetch(indices)
 
                 # pack as LoDTensorArray
                 array = core.LoDTensorArray()
                 for slot in batch:
                     if not isinstance(slot, core.LoDTensor):
                         self._check_input_array(slot)
+                        # FIXME(dkp): blocking_queue only support
+                        #             core.LoDTensorArray as input now, read
+                        #             numpy data into a LoDTensorArray here,
+                        #             should support paddle.Tensor list later
+                        if isinstance(slot, paddle.Tensor):
+                            slot = slot.numpy()
                         tmp = core.LoDTensor()
                         tmp.set(slot, core.CPUPlace())
                         slot = tmp
@@ -184,6 +308,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
 
             self._blocking_queue.close()
             self._thread = None
+        except StopIteration:
+            self._blocking_queue.close()
         except Exception:
             self._blocking_queue.kill()
             self._thread = None
@@ -192,6 +318,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
 
     @classmethod
     def _check_input_array(cls, item):
+        if isinstance(item, paddle.Tensor):
+            return
         arr = np.array(item)
         if arr.dtype == np.object:
             raise TypeError((
@@ -231,11 +359,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
 
         # data get from _data_queue will be reordered by _rcvd_idx
         # for data order keeping, data index not equal _rcvd_idx 
-        # will be cached in _reorder_dict
+        # will be cached in _task_infos
         self._send_idx = 0
         self._rcvd_idx = 0
         self._batches_outstanding = 0
-        self._reorder_dict = {}
+        self._task_infos = {}
 
         # indices outstand as _outstanding_capacity at first, and
         # blocking_queue capacity is also _outstanding_capacity.
@@ -246,14 +374,17 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._outstanding_capacity = 2 * max(self._num_workers,
                                              len(self._places))
 
-        self._init_workers()
-        self._init_thread()
-
-        self._shutdown = False
+        # see _try_put_indices
+        self._thread_lock = threading.Lock()
 
+        # init workers and indices queues and put 2 indices in each indices queue
+        self._init_workers()
         for _ in range(self._outstanding_capacity):
             self._try_put_indices()
 
+        self._init_thread()
+        self._shutdown = False
+
     def _init_workers(self):
         # multiprocess worker and indice queue list initial as empty
         self._workers = []
@@ -274,9 +405,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             self._indices_queues.append(indices_queue)
             worker = multiprocessing.Process(
                 target=self._worker_loop,
-                args=(self._dataset, indices_queue, self._data_queue,
-                      self._workers_done_event, self._collate_fn,
-                      self._worker_init_fn, i))
+                args=(self._dataset, self._dataset_kind, indices_queue,
+                      self._data_queue, self._workers_done_event,
+                      self._collate_fn, self._worker_init_fn, i,
+                      self._num_workers))
             worker.daemon = True
             worker.start()
             self._workers.append(worker)
@@ -307,7 +439,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             core.Variable(), self._outstanding_capacity, len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_buffer_reader, True)
+            self._need_check_feed, self._places, self._use_buffer_reader, True,
+            self._pin_memory)
 
         self._thread_done_event = threading.Event()
         self._thread = threading.Thread(target=self._thread_loop)
@@ -350,8 +483,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._blocking_queue.kill()
         logging.error("DataLoader reader thread raised an exception!")
 
-    def _worker_loop(self, dataset, indices_queue, out_queue, done_event,
-                     collate_fn, init_fn, worker_id):
+    def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue,
+                     done_event, collate_fn, init_fn, worker_id, num_workers):
         try:
             # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
             # some shared memory objects may have been applied for but have not yet
@@ -362,14 +495,21 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             # set signal handler
             core._set_process_signal_handler()
 
+            global _worker_info
+            _worker_info = WorkerInfo(
+                id=worker_id, num_workers=num_workers, dataset=dataset)
+
             init_exception = None
-            if init_fn is not None:
-                try:
+            try:
+                if init_fn is not None:
                     init_fn(worker_id)
-                except:
-                    init_exception = Exception("init_fn failed in worker {}: " \
-                                         "{}".format(worker_id, sys.exc_info()))
+                fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
+                                                      collate_fn, True)
+            except:
+                init_exception = Exception("init_fn failed in worker {}: " \
+                                     "{}".format(worker_id, sys.exc_info()))
 
+            iterator_drained = False
             parent_watch_dog = ParentWatchDog()
 
             while parent_watch_dog.is_alive():
@@ -380,12 +520,12 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
 
                 # None as poison piil, so worker event should be set
                 if data is None:
-                    assert done_event.is_set(
-                    ), "get None when worker done_event set"
+                    assert done_event.is_set() or iterator_drained, \
+                            "get None when worker done_event set"
                     break
                 # If worker done event is set but get still get data in
                 # indices_queue, remaining data should be get and skipped.
-                if done_event.is_set():
+                if done_event.is_set() or iterator_drained:
                     continue
 
                 idx, indices = data
@@ -394,13 +534,25 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                         batch = init_exception
                         init_exception = None
                     else:
-                        batch = [dataset[i] for i in indices]
-                        if self._collate_fn is not None:
-                            batch = self._collate_fn(batch)
+                        batch = fetcher.fetch(indices)
                 except Exception as e:
-                    out_queue.put((idx, e))
+                    if isinstance(
+                            e,
+                            StopIteration) and dataset_kind == _DatasetKind.ITER:
+                        out_queue.put(_IterableDatasetStopIteration(worker_id))
+                        iterator_drained = True
+                    else:
+                        out_queue.put((idx, e))
                 else:
                     if self._use_shared_memory:
+                        # FIXME(dkp): _convert_to_tensor_list only support np.array
+                        #             list now, should support paddle.Tensor list
+                        if isinstance(batch[0][0], paddle.Tensor):
+                            np_batch = []
+                            for sample in batch:
+                                np_batch.append([s.numpy() for s in sample])
+                            batch = np_batch
+
                         tensor_list = core._convert_to_tensor_list(batch)
                         out_queue.put((idx, tensor_list))
                         core._remove_tensor_list_mmap_fds(tensor_list)
@@ -435,7 +587,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                             # serializable, cannot be create in workers
                             for slot in batch:
                                 if not isinstance(slot, core.LoDTensor):
-                                    # self._check_input_array(slot)
                                     tmp = core.LoDTensor()
                                     tmp.set(slot, core.CPUPlace())
                                     slot = tmp
@@ -450,10 +601,33 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                         self._rcvd_idx += 1
 
     def _get_data(self):
-        if self._rcvd_idx in self._reorder_dict.keys():
-            return self._reorder_dict.pop(self._rcvd_idx)
-
         while not self._thread_done_event.is_set():
+            # For IterableDataset, batch indices is generated infinitely
+            # for each worker to raise StopIteration, but a StopIteration
+            # raising process will discard a batch indices which is count
+            # in _send_idx but will not increase _rcvd_idx, so we check 
+            # whether the worker is still alive here to skip the discarded
+            # batch indices and increase _rcvd_idx
+            if self._dataset_kind == _DatasetKind.ITER:
+                while self._rcvd_idx < self._send_idx:
+                    info = self._task_infos[self._rcvd_idx]
+                    if len(info) == 2 or self._worker_status[info[0]]:
+                        break
+                    del self._task_infos[self._rcvd_idx]
+                    self._rcvd_idx += 1
+                    self._batches_outstanding -= 1
+                else:
+                    # NOTE: _rcvd_idx and _send_idx only record batches among
+                    #       workers, if batches among workers drained, there
+                    #       may also be data in blocking queue
+                    if self._batches_outstanding < len(self._places):
+                        return None
+                    continue
+
+            if self._rcvd_idx in self._task_infos and \
+                    len(self._task_infos[self._rcvd_idx]) == 2:
+                return self._task_infos.pop(self._rcvd_idx)[1]
+
             try:
                 # [ avoid hang ]: main process may blocking at _reader.read_next when
                 # KeyboardInterrupt, we do following tradeoff:
@@ -491,25 +665,55 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                               "workers' result queue.".format(e))
                 six.reraise(*sys.exc_info())
             else:
+                if self._dataset_kind == _DatasetKind.ITER and isinstance(
+                        data, _IterableDatasetStopIteration):
+                    # if a worker get StopIteraion, we shutdown this worker,
+                    # note that this batch indices to trigger StopIteration
+                    # is discard, outstanding batch number should be decrease
+                    # and another indices should be put for other workers
+                    # may still working.
+                    self._shutdown_worker(data.worker_id)
+                    self._batches_outstanding -= 1
+                    self._try_put_indices()
+                    continue
+
                 idx, batch = data
                 if idx == self._rcvd_idx:
+                    del self._task_infos[idx]
                     return batch
                 else:
-                    self._reorder_dict[idx] = batch
+                    self._task_infos[idx] += (batch, )
                     continue
 
     def _try_put_indices(self):
-        assert self._send_idx - self._rcvd_idx <= self._outstanding_capacity, \
+        assert self._batches_outstanding <= self._outstanding_capacity, \
                     "too many indices have been put to queue"
-        try:
-            indices = next(self._sampler_iter)
-        except StopIteration:
-            return
+        # In multi-process mode for IterableDataset, _try_put_indices will
+        # be called both in main process(for our implement has blocking queue,
+        # and blocking queue read is in main process) and thread, which may
+        # cause error following error
+        #   1. "ValueError: generator already executing" in next(self._sampler_iter)
+        #   2. re-enter in increase _send_idx
+        # add a lock for threading save, for _try_put_indices is only a slight
+        # function which is not in data reading pipeline, this lock almost no
+        # influence on performance
+        with self._thread_lock:
+            try:
+                indices = next(self._sampler_iter)
+            except StopIteration:
+                return
+
+            for i in range(self._num_workers):
+                worker_idx = next(self._workers_idx_cycle)
+                if self._worker_status[worker_idx]:
+                    break
+            else:
+                return
 
-        worker_idx = next(self._workers_idx_cycle)
-        self._indices_queues[worker_idx].put((self._send_idx, indices))
-        self._batches_outstanding += 1
-        self._send_idx += 1
+            self._indices_queues[worker_idx].put((self._send_idx, indices))
+            self._task_infos[self._send_idx] = (worker_idx, )
+            self._batches_outstanding += 1
+            self._send_idx += 1
 
     def __del__(self):
         self._try_shutdown_all()
diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py
index b49ceaddefdef272d34ea9008d57d3f1ced43205..13bb946a5ebca09686fc7f56b2f7c5b068ea3148 100644
--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -14,14 +14,15 @@
 
 from __future__ import print_function
 
+from .. import framework
 import paddle.dataset.common
 
-__all__ = ["Dataset"]
+__all__ = ["Dataset", "IterableDataset", "TensorDataset"]
 
 
 class Dataset(object):
     """
-    An abstract class to encapsulates methods and behaviors of datasets.
+    An abstract class to encapsulate methods and behaviors of datasets.
 
     All datasets in map-style(dataset samples can be get by a given key)
     should be a subclass of `paddle.io.Dataset`. All subclasses should
@@ -71,3 +72,206 @@ class Dataset(object):
     def __len__(self):
         raise NotImplementedError("'{}' not implement in class "\
                 "{}".format('__len__', self.__class__.__name__))
+
+
+class IterableDataset(Dataset):
+    """
+    An abstract class to encapsulate methods and behaviors of iterable datasets.
+
+    All datasets in iterable-style (can only get sample one by one sequentially, like
+    a Python iterator) should be a subclass of `paddle.io.IterableDataset`. All subclasses should
+    implement following methods:
+
+    :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :code:`paddle.io.DataLoader`.
+
+    .. note::
+        do not implement :code:`__getitem__` and :code:`__len__` in IterableDataset, should not be called either.
+
+    see :code:`paddle.io.DataLoader`.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import numpy as np
+            from paddle.io import Dataset
+            
+            # define a random dataset
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __iter__(self):
+                    for i in range(self.num_samples):
+                        image = np.random.random([784]).astype('float32')
+                        label = np.random.randint(0, 9, (1, )).astype('int64')
+                        yield image, label
+            
+            dataset = RandomDataset(10)
+            for img, lbl in dataset:
+                print(img, lbl)
+
+    When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
+    will yield whole dataset samples, which means samples in dataset will be repeated in
+    :attr:`num_workers` times. If it is required for each sample to yield only once, there
+    are two methods to configure different copy in each worker process to avoid duplicate data
+    among workers as follows. In both the methods, worker information that can be getted in
+    a worker process by `paddle.io.get_worker_info` will be needed.
+
+    Example 1: splitting data copy in each worker in :code:`__iter__`
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class SplitedIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    worker_info = get_worker_info()
+                    if worker_info is None:
+                        iter_start = self.start
+                        iter_end = self.end
+                    else:
+                        per_worker = int(
+                            math.ceil((self.end - self.start) / float(
+                                worker_info.num_workers)))
+                        worker_id = worker_info.id
+                        iter_start = self.start + worker_id * per_worker
+                        iter_end = min(iter_start + per_worker, self.end)
+
+                    for i in range(iter_start, iter_end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = SplitedIterableDataset(start=2, end=9)
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    Example 2: splitting data copy in each worker by :code:`worker_init_fn`
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class RangeIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    for i in range(self.start, self.end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = RangeIterableDataset(start=2, end=9)
+
+                def worker_init_fn(worker_id):
+                    worker_info = get_worker_info()
+
+                    dataset = worker_info.dataset
+                    start = dataset.start
+                    end = dataset.end
+                    num_per_worker = int(
+                        math.ceil((end - start) / float(worker_info.num_workers)))
+
+                    worker_id = worker_info.id
+                    dataset.start = start + worker_id * num_per_worker
+                    dataset.end = min(dataset.start + num_per_worker, end)
+
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True,
+                    worker_init_fn=worker_init_fn)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    """
+
+    def __init__(self):
+        pass
+
+    def __iter__(self):
+        raise NotImplementedError("'{}' not implement in class "\
+                "{}".format('__iter__', self.__class__.__name__))
+
+    def __getitem__(self, idx):
+        raise RuntimeError("'{}' should not be called for IterableDataset" \
+                "{}".format('__getitem__', self.__class__.__name__))
+
+    def __len__(self):
+        raise RuntimeError("'{}' should not be called for IterableDataset" \
+                "{}".format('__len__', self.__class__.__name__))
+
+
+class TensorDataset(Dataset):
+    """
+    Dataset defined by a list of tensors.
+
+    Each tensor should be in shape of [N, ...], while N is the sample number,
+    and ecah tensor contains a field of sample, :code:`TensorDataset` retrieve
+    each sample by indexing tensors in the 1st dimension.
+
+    Args:
+        tensors(list of Tensor): tensors with same shape in the 1st dimension.
+
+    Returns:
+        Dataset: a Dataset instance wrapping tensors.
+
+    Examples:
+
+        .. code-block:: python
+        
+            import numpy as np
+            import paddle
+            from paddle.io import TensorDataset
+
+            paddle.disable_static()
+
+            input_np = np.random.random([2, 3, 4]).astype('float32')
+            input = paddle.to_tensor(input_np)
+            label_np = np.random.random([2, 1]).astype('int32')
+            label = paddle.to_tensor(label_np)
+
+            dataset = TensorDataset([input, label])
+
+            for i in range(len(dataset)):
+                input, label = dataset[i]
+                print(input, label)
+
+    """
+
+    def __init__(self, tensors):
+        if not framework.in_dygraph_mode():
+            raise RuntimeError(
+                "TensorDataset con only be used in imperative mode")
+        assert all([tensor.shape[0] == tensors[0].shape[0] for tensor in tensors]), \
+                "tensors not have same shape of the 1st dimension"
+        self.tensors = tensors
+
+    def __getitem__(self, index):
+        return tuple(tensor[index] for tensor in self.tensors)
+
+    def __len__(self):
+        return self.tensors[0].shape[0]
diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..001b8b931da233557c5d0b11af283a6e631788ae
--- /dev/null
+++ b/python/paddle/fluid/dataloader/fetcher.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class _DatasetFetcher(object):
+    def __init__(self, dataset, collate_fn, drop_last):
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.drop_last = drop_last
+
+    def fetch(self, batch_indices):
+        raise NotImplementedError("'fetch' not implement for class {}".format(
+            self.__class__.__name__))
+
+
+class _IterableDatasetFetcher(_DatasetFetcher):
+    def __init__(self, dataset, collate_fn, drop_last):
+        super(_IterableDatasetFetcher, self).__init__(dataset, collate_fn,
+                                                      drop_last)
+        self.dataset_iter = iter(dataset)
+
+    def fetch(self, batch_indices):
+        data = []
+        for _ in batch_indices:
+            try:
+                data.append(next(self.dataset_iter))
+            except StopIteration:
+                break
+        if len(data) == 0 or (self.drop_last and
+                              len(data) < len(batch_indices)):
+            raise StopIteration
+
+        return self.collate_fn(data)
+
+
+class _MapDatasetFetcher(_DatasetFetcher):
+    def __init__(self, dataset, collate_fn, drop_last):
+        super(_MapDatasetFetcher, self).__init__(dataset, collate_fn, drop_last)
+
+    def fetch(self, batch_indices):
+        data = [self.dataset[idx] for idx in batch_indices]
+        return self.collate_fn(data)
diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c75fafe8b22380090ba6fb580777cdbe6570ad6
--- /dev/null
+++ b/python/paddle/fluid/dataloader/sampler.py
@@ -0,0 +1,236 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import numpy as np
+
+__all__ = ["Sampler", "SequenceSampler", "RandomSampler"]
+
+
+class Sampler(object):
+    """
+    An abstract class to encapsulate methods and behaviors of samplers.
+
+    All sampler used by :code:`paddle.io.BatchSampler` should be a subclass
+    of :code:`paddle.io.Sampler`, BatchSampler subclasses should
+    implement following methods:
+
+    :code:`__iter__`: return sample index iterably, which iterate over indices
+    of dataset elements
+
+    :code:`__len__`: the number of sample in :attr:`data_source`
+
+
+    Args:
+        data_source(Dataset, optional): this could be an instance of
+                :code:`paddle.io.Dataset` other Python object which
+                implemented :code:`__len__` for Sampler to get indices
+                as the range of :attr:`dataset` length. Default None.
+
+    Returns:
+        Sampler: an iterable object for sample indices iterating
+
+    Examples:
+        
+        .. code-block:: python
+            
+            from paddle.io import Dataset, Sampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            class MySampler(Sampler):
+                def __init__(self, data_source):
+                    self.data_source = data_source
+
+                def __iter__(self):
+                    return iter(range(len(self.data_source)))
+
+                def __len__(self):
+                    return len(self.data_source)
+            
+            sampler = MySampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.BatchSampler`
+    see `paddle.io.DataLoader`
+
+    """
+
+    def __init__(self, data_source=None):
+        self.data_source = data_source
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    # Not define __len__ method in this base class here for __len__
+    # is not needed in same sence, e.g. paddle.io.IterableDataset
+
+
+class SequenceSampler(Sampler):
+    """
+    Iterate samples sequentially, yield :code:`0, 1, 2, ..., len(data_source) -1`
+    generally,
+
+    Args:
+        data_source(Dataset): dataset to sample, this could be an
+                instance of :code:`paddle.io.Dataset` other Python
+                object which implemented :code:`__len__`.
+
+    Returns:
+        Sampler: a Sampler yield sample index sequentially
+
+    Examples:
+
+        .. code-block:: python
+            
+            from paddle.io import Dataset, SequenceSampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            sampler = SequenceSampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.Sampler`
+    """
+
+    def __init__(self, data_source):
+        self.data_source = data_source
+
+    def __iter__(self):
+        return iter(range(len(self.data_source)))
+
+    def __len__(self):
+        return len(self.data_source)
+
+
+class RandomSampler(Sampler):
+    """
+    Iterate samples randomly, yield shuffled indices, if :attr:`replacement=False`,
+    yield shuffled indices of the whole data souce, if :attr:`replacement=True`,
+    :attr:`num_samples` can set to specify the sample number to draw.
+
+    Args:
+        data_source(Dataset): dataset to sample, this could be an
+                instance of :code:`paddle.io.Dataset` other Python
+                object which implemented :code:`__len__`.
+        replacement(bool): If False, sample the whole dataset, If False,
+                set :attr:`num_samples` for how many sample to draw. Default False.
+        num_samples(int): set sample number to draw if :attr:`replacement`
+                is True. Default None.
+        generator(Generator): specify a generator to sample the data source. Default None
+        
+    Returns:
+        Sampler: a Sampler yield sample index randomly
+
+    Examples:
+
+        .. code-block:: python
+            
+            from paddle.io import Dataset, RandomSampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            sampler = RandomSampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.Sampler`
+    """
+
+    def __init__(self,
+                 data_source,
+                 replacement=False,
+                 num_samples=None,
+                 generator=None):
+        self.data_source = data_source
+        self.replacement = replacement
+        self._num_samples = num_samples
+        self.generator = generator
+
+        if not isinstance(self.replacement, bool):
+            raise TypeError("expect boolean value for replacement, but got "
+                            "replacement={}".format(self.replacement))
+
+        if self._num_samples is not None and not replacement:
+            raise ValueError(
+                "num_samples should not be specified while replacement is False")
+
+        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
+            raise ValueError("num_samples should be a positive integer, "
+                             "but got num_samples={}".format(self.num_samples))
+
+    @property
+    def num_samples(self):
+        if self._num_samples is None:
+            return len(self.data_source)
+        return self._num_samples
+
+    def __iter__(self):
+        n = len(self.data_source)
+        if self.generator:
+            for i in range(self.num_samples):
+                try:
+                    index = next(self.generator)
+                except StopIteration:
+                    return
+                yield index
+        else:
+            if self.replacement:
+                for index in np.random.choice(
+                        np.arange(n), self.num_samples, replace=True).tolist():
+                    yield index
+            else:
+                for index in np.random.choice(
+                        np.arange(n), n, replace=False).tolist():
+                    yield index
+
+    def __len__(self):
+        return self.num_samples
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index f990d02342be78fe998cebfa40ed8b348cf54b2a..cf270ced3b704179856b1ab04dbeae8a04fbc589 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -38,9 +38,6 @@ from .checkpoint import *
 from . import learning_rate_scheduler
 from .learning_rate_scheduler import *
 
-from . import backward_strategy
-from .backward_strategy import *
-
 from . import jit
 from .jit import *
 
@@ -56,6 +53,11 @@ from .dygraph_to_static import ProgramTranslator
 from . import rnn
 from .rnn import *
 
+from . import amp
+from .amp import *
+
+from .math_op_patch import monkey_patch_math_varbase
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
@@ -64,8 +66,8 @@ __all__ += nn.__all__
 __all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
-__all__ += backward_strategy.__all__
 __all__ += jit.__all__
 __all__ += io.__all__
 __all__ += rnn.__all__
 __all__ += ['ProgramTranslator']
+__all__ += amp.__all__
diff --git a/python/paddle/imperative/jit/__init__.py b/python/paddle/fluid/dygraph/amp/__init__.py
similarity index 77%
rename from python/paddle/imperative/jit/__init__.py
rename to python/paddle/fluid/dygraph/amp/__init__.py
index 85fccf6e689ebf606092df8c3f94f561a68705ed..e86c5a20c5a411fda2a0011f63f4b5254e9bd07a 100644
--- a/python/paddle/imperative/jit/__init__.py
+++ b/python/paddle/fluid/dygraph/amp/__init__.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.dygraph.jit import save, load, SaveLoadConfig
-from ...fluid.dygraph.io import TranslatedLayer
+from . import auto_cast
+from .auto_cast import *
 
-__all__ = ['save', 'load', 'SaveLoadConfig']
+from . import loss_scaler
+from .loss_scaler import *
+
+__all__ = []
+__all__ += auto_cast.__all__
+__all__ += loss_scaler.__all__
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb4d9f16f29f384b83f175ddcb60f65e8077930
--- /dev/null
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -0,0 +1,166 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from paddle.fluid.wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+from paddle.fluid import core
+import contextlib
+from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
+import warnings
+import copy
+
+__all__ = ['amp_guard']
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# safe and performance-critical. These ops are always converted to fp16.
+WHITE_LIST = {
+    'conv2d',
+    'matmul',
+    'mul',
+}
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# dangerous and whose effects may also be observed in downstream ops.
+BLACK_LIST = {
+    'exp',
+    'square',
+    'log',
+    'mean',
+    'sum',
+    'cos_sim',
+    'softmax',
+    'softmax_with_cross_entropy',
+    'sigmoid_cross_entropy_with_logits',
+    'cross_entropy',
+    'cross_entropy2',
+}
+
+AMP_RELATED_FLAGS = [
+    'FLAGS_cudnn_exhaustive_search',
+    'FLAGS_conv_workspace_size_limit',
+    'FLAGS_cudnn_batchnorm_spatial_persistent',
+]
+
+AMP_RELATED_FLAGS_SETTING = {
+    'FLAGS_cudnn_exhaustive_search': 1,
+    'FLAGS_conv_workspace_size_limit': 1000,
+    'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
+}
+
+
+#NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
+# The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
+def _update_list(custom_white_list, custom_black_list):
+    """
+    Update black and white list according to users' custom list.
+    """
+    _white_list = copy.copy(WHITE_LIST)
+    _black_list = copy.copy(BLACK_LIST)
+    if custom_white_list and custom_black_list:
+        for op_name in custom_white_list:
+            if op_name in custom_black_list:
+                raise ValueError("Custom white list overlap "
+                                 "custom black list")
+    if custom_white_list:
+        for op_name in custom_white_list:
+            if op_name in _black_list:
+                _black_list.remove(op_name)
+            _white_list.add(op_name)
+    if custom_black_list:
+        for op_name in custom_black_list:
+            if op_name in _white_list:
+                _white_list.remove(op_name)
+            _black_list.add(op_name)
+    return _white_list, _black_list
+
+
+@signature_safe_contextmanager
+@dygraph_only
+def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
+    """
+    :api_attr: imperative
+
+    Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode.
+    If enabled, the input data type (float32 or float16) of each operator is decided 
+    by autocast algorithm for better performance. 
+    
+    Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in 
+    imperative mode.
+
+    Args:
+        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
+        custom_white_list(set|list, optional): The custom white_list.
+        custom_black_list(set|list, optional): The custom black_list.
+        
+    Examples:
+
+     .. code-block:: python
+
+        import numpy as np
+        import paddle.fluid as fluid
+
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard():
+                conv = conv2d(data)
+                print(conv.dtype) # FP16
+            with fluid.dygraph.amp_guard(enable=False):
+                conv = conv2d(data)
+                print(conv.dtype) # FP32
+
+    """
+    tracer = _dygraph_tracer()
+    if not tracer:
+        raise ValueError(
+            "current_tracer is None, maybe it is not in imperative mode.")
+
+    if enable and not tracer._expected_place.is_gpu_place():
+        warnings.warn(
+            'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+            % tracer._expected_place)
+        enable = False
+
+    # use default white_list and black_list if no custom lists provided
+    _white_list = WHITE_LIST
+    _black_list = BLACK_LIST
+    if custom_white_list or custom_black_list:
+        _white_list, _black_list = _update_list(custom_white_list,
+                                                custom_black_list)
+
+    if tracer:
+        # enable auto_cast
+        original_enable = tracer._enable_autocast
+        tracer._enable_autocast = enable
+        # set amp op list
+        original_white_list, original_black_list = tracer._get_amp_op_list()
+        tracer._set_amp_op_list(_white_list, _black_list)
+
+        # TODO(zhiqiu) set amp related flags automatically in this guard
+        # Currently, if FLAGS_cudnn_batchnorm_spatial_persistent is set True in amp_guard,
+        # batch_norm can run in fast mode, but batch_norm_grad can not if backward if not executed insise amp_guard.
+        # So, users need to set related flags manually.
+
+        # original_flags = get_flags(AMP_RELATED_FLAGS)
+        # set_flags(AMP_RELATED_FLAGS_SETTING)
+
+    # restore status
+    try:
+        yield
+    finally:
+        if tracer:
+            tracer._enable_autocast = original_enable
+            tracer._set_amp_op_list(original_white_list, original_black_list)
+            # set_flags(original_flags)
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f3ca9ec007ef5c1ab8769dde741a5d2b3697600
--- /dev/null
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -0,0 +1,246 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from paddle.fluid import core
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import _varbase_creator, _dygraph_tracer, dygraph_only
+from paddle.fluid.data_feeder import check_type
+from ...wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+import warnings
+import numpy as np
+
+__all__ = ['AmpScaler']
+
+
+class AmpScaler(object):
+    """
+    :api_attr: imperative
+
+    AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative
+    mode. It controls the scaling of loss, helps avoiding numerical overflow.
+    The object of this class has two methods `scale()`, `minimize()`.
+
+    `scale()` is used to multiply the loss by a scale ratio.
+    `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating.
+
+    Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in 
+    imperative mode.
+
+    Args:
+        enable(bool, optional): Enable loss scaling or not. Default is True.
+        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
+        incr_ratio(float, optional): The multiplier to use when increasing the loss 
+                        scaling. Default is 2.0.
+        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing 
+                        the loss scaling. Default is 0.5.
+        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive 
+                                steps with finite gradients. Default is 1000.
+        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n 
+                                    accumulated steps with nan or inf gradients. Default is 2.
+        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.
+    Returns:
+        An AmpScaler object.
+
+    Examples:
+
+     .. code-block:: python
+
+        import numpy as np
+        import paddle.fluid as fluid
+
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            model = fluid.dygraph.Conv2D(3, 2, 3)
+            optimizer = fluid.optimizer.SGDOptimizer(
+                    learning_rate=0.01, parameter_list=model.parameters())
+            scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard():
+                conv = model(data)
+                loss = fluid.layers.reduce_mean(conv)
+                scaled = scaler.scale(loss)
+                scaled.backward()
+                scaler.minimize(optimizer, scaled)         
+    """
+
+    @dygraph_only
+    def __init__(self,
+                 enable=True,
+                 init_loss_scaling=2.**15,
+                 incr_ratio=2.0,
+                 decr_ratio=0.5,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=1,
+                 use_dynamic_loss_scaling=True):
+
+        tracer = _dygraph_tracer()
+        if not tracer:
+            raise ValueError(
+                "current_tracer is None, maybe it is not in imperative mode.")
+
+        if enable and not tracer._expected_place.is_gpu_place():
+            warnings.warn(
+                'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+                % tracer._expected_place)
+            enable = False
+
+        self._enable = enable
+
+        if self._enable:
+            assert incr_ratio > 1.0, "The incr_ratio must be > 1.0."
+            assert decr_ratio < 1.0, "The decr_ratio must be < 1.0."
+
+            self._init_loss_scaling = init_loss_scaling
+            self._incr_ratio = incr_ratio
+            self._decr_ratio = decr_ratio
+            self._incr_every_n_steps = incr_every_n_steps
+            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
+            self._incr_count = 0
+            self._decr_count = 0
+            self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
+
+            self._found_inf = to_variable(np.array([0]).astype(np.bool))
+            self._scale = to_variable(
+                np.array([self._init_loss_scaling]).astype(np.float32))
+            self._cache_founf_inf = None
+
+    def scale(self, var):
+        """
+        Multiplies a variable(Tensor) by the scale factor and returns scaled outputs.  
+        If this instance of :class:`AmpScaler` is not enabled, output are returned unmodified.
+
+        Args:
+            var (Variable):  The variable to scale.
+        Returns:
+            The scaled variable or original variable.
+        
+        Examples:
+            .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                model = fluid.dygraph.Conv2D(3, 2, 3)
+                optimizer = fluid.optimizer.SGDOptimizer(
+                        learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(data)
+                with fluid.dygraph.amp_guard():
+                    conv = model(data)
+                    loss = fluid.layers.reduce_mean(conv)
+                    scaled = scaler.scale(loss)
+                    scaled.backward()
+                    scaler.minimize(optimizer, scaled) 
+        """
+        check_type(var, "var", core.VarBase, 'AmpScaler.scale()')
+
+        if not self._enable:
+            return var
+
+        return var * self._scale
+
+    def minimize(self, optimizer, *args, **kwargs):
+        """
+        This function is similar as `Optimizer.minimize()`, which performs parameters updating.
+        
+        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
+        Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
+
+        Finally, the loss scaling ratio is updated.
+
+        Args:
+            optimizer(Optimizer):  The optimizer used to update parameters.
+            args:  Arguments, which will be forward to `optimizer.minimize()`.
+            kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.
+
+        Examples:
+            .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                model = fluid.dygraph.Conv2D(3, 2, 3)
+                optimizer = fluid.optimizer.SGDOptimizer(
+                        learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(data)
+                with fluid.dygraph.amp_guard():
+                    conv = model(data)
+                    loss = fluid.layers.reduce_mean(conv)
+                    scaled = scaler.scale(loss)
+                    scaled.backward()
+                    scaler.minimize(optimizer, scaled) 
+        """
+        if not self._enable:
+            return optimizer.minimize(*args, **kwargs)
+
+        #  unscale the grad
+        self._unscale(optimizer)
+
+        optimize_ops, params_grads = (None, None)
+
+        if self._found_inf:
+            self._cache_founf_inf = True
+        else:
+            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+            self._cache_founf_inf = False
+
+        if self._use_dynamic_loss_scaling:
+            # uopdate the scale
+            self._update()
+
+        return optimize_ops, params_grads
+
+    def _unscale(self, optimizer):
+        if not self._enable:
+            return
+        inv_scale = 1.0 / self._scale
+        param_grads = [
+            param._grad_ivar() for param in optimizer._parameter_list
+            if param._grad_ivar() is not None
+        ]
+        core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads,
+                                            self._found_inf)
+
+    def _update(self):
+        """
+        Updates the loss_scaling.
+        """
+        if not self._enable:
+            return
+
+        if self._cache_founf_inf:
+            self._incr_count = 0
+            self._decr_count = self._decr_count + 1
+            if self._decr_count == self._decr_every_n_nan_or_inf:
+                print(
+                    'Found inf or nan, current scale is: {}, decrease to: {}*{}'.
+                    format(
+                        float(self._scale),
+                        float(self._scale), float(self._decr_ratio)))
+                self._scale = self._scale * self._decr_ratio
+                self._decr_count = 0
+        else:
+            self._decr_count = 0
+            self._incr_count = self._incr_count + 1
+            if self._incr_count == self._incr_every_n_steps:
+                self._scale = self._scale * self._incr_ratio
+                self._incr_count = 0
+
+        return
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 7d972cbbd09b95e5d7476837cb3f3318526deed8..0c4a1964838c608fc5dd46a1dfb16d3d3d7b6ed9 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+import inspect
 import decorator
 import contextlib
-import functools
 import sys
 import numpy as np
 from paddle.fluid import core
@@ -26,13 +26,8 @@ import objgraph
 from ..data_feeder import convert_dtype
 
 __all__ = [
-    'no_grad',
-    'grad',
-    'guard',
-    'enable_dygraph',
-    'disable_dygraph',
-    'enabled',
-    'to_variable',
+    'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled',
+    'to_variable'
 ]
 
 
@@ -96,8 +91,8 @@ def enabled():
     """
     This function checks whether the program runs in dynamic graph mode or not.
     You can enter dynamic graph mode with :ref:`api_fluid_dygraph_guard` api,
-    or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable`
-    and :ref:`api_fluid_dygraph_disable` api .
+    or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable_dygraph`
+    and :ref:`api_fluid_dygraph_disable_dygraph` api .
 
     **Note**:
         ``fluid.dygraph.enabled`` is the alias of ``fluid.in_dygraph_mode``, and
@@ -121,10 +116,6 @@ def enabled():
 
 def enable_dygraph(place=None):
     """
-    :alias_main: paddle.enable_dygraph
-	:alias: paddle.enable_dygraph,paddle.enable_imperative.enable_dygraph
-	:old_api: paddle.fluid.dygraph.base.enable_dygraph
-
     This function enables dynamic graph mode.
 
     Parameters:
@@ -155,10 +146,6 @@ def enable_dygraph(place=None):
 
 def disable_dygraph():
     """
-    :alias_main: paddle.disable_dygraph
-	:alias: paddle.disable_dygraph,paddle.disable_imperative.disable_dygraph
-	:old_api: paddle.fluid.dygraph.base.disable_dygraph
-
     This function disables dynamic graph mode.
 
     return:
@@ -180,77 +167,82 @@ def disable_dygraph():
         _functional_dygraph_context_manager = None
 
 
-@signature_safe_contextmanager
-def _switch_tracer_mode_guard_(is_train=True):
-    tracer = framework._dygraph_tracer()
-    if tracer:
-        mode = tracer._train_mode
-        tracer._train_mode = is_train
-        try:
-            yield
-        finally:
-            tracer._train_mode = mode
-    else:
-        yield
-
-
-def no_grad(func=None):
+class no_grad:
     """
     :api_attr: imperative
 
     Create a context which disables dygraph gradient calculation.
-    In this mode, the result of every computation will have `stop_gradient=True`.
+    In this mode, the result of every computation will have `stop_gradient` set
+    to `True`.
 
-    Also functions as a decorator. (Make sure to instantiate without parenthesis.)
+    Also functions as a decorator. (Make sure to use an instance.)
 
     Examples:
 
      .. code-block:: python
 
         import numpy as np
-        import paddle.fluid as fluid
+        import paddle
+
+        paddle.disable_static()
 
         # use as generator
 
         data = np.array([[2, 3], [4, 5]]).astype('float32')
-        with fluid.dygraph.guard():
-            l0 = fluid.Linear(2, 2)  # l0.weight.gradient() is None
-            l1 = fluid.Linear(2, 2)
-            with fluid.dygraph.no_grad():
-                # l1.weight.stop_gradient is False
-                tmp = l1.weight * 2  # tmp.stop_gradient is True
-            x = fluid.dygraph.to_variable(data)
-            y = l0(x) + tmp
-            o = l1(y)
-            o.backward()
-            print(tmp.gradient() is None)  # True
-            print(l0.weight.gradient() is None)  # False
+        l0 = paddle.nn.Linear(2, 2)  # l0.weight.gradient() is None
+        l1 = paddle.nn.Linear(2, 2)
+        with paddle.no_grad():
+            # l1.weight.stop_gradient is False
+            tmp = l1.weight * 2  # tmp.stop_gradient is True
+        x = paddle.to_tensor(data)
+        y = l0(x) + tmp
+        o = l1(y)
+        o.backward()
+        print(tmp.gradient() is None)  # True
+        print(l0.weight.gradient() is None)  # False
 
         # use as decorator
 
-        @fluid.dygraph.no_grad
+        @paddle.no_grad()
         def test_layer():
-            with fluid.dygraph.guard():
-                inp = np.ones([3, 1024], dtype='float32')
-                t = fluid.dygraph.base.to_variable(inp)
-                linear1 = fluid.Linear(1024, 4, bias_attr=False)
-                linear2 = fluid.Linear(4, 4)
-                ret = linear1(t)
-                dy_ret = linear2(ret)
+            inp = np.ones([3, 1024], dtype='float32')
+            t = paddle.to_tensor(inp)
+            linear1 = paddle.nn.Linear(1024, 4, bias_attr=False)
+            linear2 = paddle.nn.Linear(4, 4)
+            ret = linear1(t)
+            dy_ret = linear2(ret)
 
         test_layer()
-
     """
-    if func is None:
-        return _switch_tracer_mode_guard_(is_train=False)
-    else:
 
+    def __call__(self, func):
         @decorator.decorator
-        def __impl__(func, *args, **kwargs):
-            with _switch_tracer_mode_guard_(is_train=False):
+        def _decorate_function(func, *args, **kwargs):
+            with self:
                 return func(*args, **kwargs)
 
-        return __impl__(func)
+        @decorator.decorator
+        def _decorate_generator(func, *args, **kwargs):
+            gen = func(*args, **kwargs)
+            with self:
+                for x in gen:
+                    yield x
+
+        if inspect.isgeneratorfunction(func):
+            return _decorate_generator(func)
+        else:
+            return _decorate_function(func)
+
+    def __enter__(self):
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            self.orig = tracer._train_mode
+            tracer._train_mode = False
+
+    def __exit__(self, *args):
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            tracer._train_mode = self.orig
 
 
 @signature_safe_contextmanager
@@ -288,12 +280,11 @@ def guard(place=None):
     tracer = Tracer()
     VarBase = core.VarBase
 
-    if place is None:
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-    tracer._expected_place = place
+    if place is not None:
+        expected_place = place
+    else:
+        expected_place = framework._current_expected_place()
+    tracer._expected_place = expected_place
 
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
@@ -328,8 +319,7 @@ def grad(outputs,
          create_graph=False,
          only_inputs=True,
          allow_unused=False,
-         no_grad_vars=None,
-         backward_strategy=None):
+         no_grad_vars=None):
     ''' 
     .. note::
         **This API is ONLY available in Dygraph mode.**
@@ -372,9 +362,6 @@ def grad(outputs,
             their gradients if allow_unused=True. Default False.
         no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional): 
             the Variables whose gradients are not needed to compute. Default None.
-        backward_strategy (BackwardStrategy, optional): The backward strategy to
-            compute gradients. See :ref:`api_fluid_dygraph_BackwardStrategy` for
-            details. Default None.
 
     Returns:
         tuple: a tuple of Variables, whose length is the same as the Variable number 
@@ -384,47 +371,46 @@ def grad(outputs,
     Examples 1:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
+            paddle.disable_static()
 
             def test_dygraph_grad(create_graph):
-                with fluid.dygraph.guard(): 
-                    x = fluid.layers.ones(shape=[1], dtype='float32') 
-                    x.stop_gradient = False
-                    y = x * x
-
-                    # Since y = x * x, dx = 2 * x 
-                    dx = fluid.dygraph.grad(
-                            outputs=[y],
-                            inputs=[x], 
-                            create_graph=create_graph, 
-                            retain_graph=True)[0]
-
-                    z = y + dx
-
-                    # If create_graph = False, the gradient of dx
-                    # would not be backpropagated. Therefore,
-                    # z = x * x + dx, and x.gradient() = 2 * x = 2.0
-                    
-                    # If create_graph = True, the gradient of dx
-                    # would be backpropagated. Therefore, 
-                    # z = x * x + dx = x * x + 2 * x, and
-                    # x.gradient() = 2 * x + 2 = 4.0 
-
-                    z.backward()
-                    return x.gradient() 
-
-            print(test_dygraph_grad(create_graph=False)) # [2.] 
+                x = paddle.ones(shape=[1], dtype='float32')
+                x.stop_gradient = False
+                y = x * x
+
+                # Since y = x * x, dx = 2 * x
+                dx = paddle.grad(
+                        outputs=[y],
+                        inputs=[x],
+                        create_graph=create_graph,
+                        retain_graph=True)[0]
+
+                z = y + dx
+
+                # If create_graph = False, the gradient of dx
+                # would not be backpropagated. Therefore,
+                # z = x * x + dx, and x.gradient() = 2 * x = 2.0
+
+                # If create_graph = True, the gradient of dx
+                # would be backpropagated. Therefore,
+                # z = x * x + dx = x * x + 2 * x, and
+                # x.gradient() = 2 * x + 2 = 4.0
+
+                z.backward()
+                return x.gradient()
+
+            print(test_dygraph_grad(create_graph=False)) # [2.]
             print(test_dygraph_grad(create_graph=True)) # [4.]
 
     Examples 2:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-
-            fluid.enable_dygraph()
+            import paddle
+            paddle.disable_static()
 
             def test_dygraph_grad(grad_outputs=None):
-                x = fluid.layers.fill_constant(shape=[1], value=2.0, dtype='float32')
+                x = paddle.fill_constant(shape=[1], value=2.0, dtype='float32')
                 x.stop_gradient = False
 
                 y1 = x * x
@@ -440,27 +426,27 @@ def grad(outputs,
                 # Therefore, the final result would be:
                 # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2.
 
-                dx = fluid.dygraph.grad(
+                dx = paddle.grad(
                     outputs=[y1, y2], 
                     inputs=[x],
                     grad_outputs=grad_outputs)[0]
 
                 return dx.numpy()
 
-            THREE = fluid.layers.fill_constant(shape=[1], value=3.0, dtype='float32')
-            FOUR = fluid.layers.fill_constant(shape=[1], value=4.0, dtype='float32')
+            grad_value = paddle.fill_constant(shape=[1], value=4.0, dtype='float32')
 
             # dy1 = [1], dy2 = [1]
             print(test_dygraph_grad(None)) # [7.]
 
             # dy1 = [1], dy2 = [4]
-            print(test_dygraph_grad([None, FOUR])) # [16.] 
+            print(test_dygraph_grad([None, grad_value])) # [16.]
 
             # dy1 = [4], dy2 = [1]
-            print(test_dygraph_grad([FOUR, None])) # [19.]
+            print(test_dygraph_grad([grad_value, None])) # [19.]
 
             # dy1 = [3], dy2 = [4]
-            print(test_dygraph_grad([THREE, FOUR])) # [24.]
+            grad_y1 = paddle.fill_constant(shape=[1], value=3.0, dtype='float32')
+            print(test_dygraph_grad([grad_y1, grad_value])) # [24.]
 	'''
 
     def check_in_out(in_out_list, name):
@@ -513,12 +499,6 @@ def grad(outputs,
         raise AssertionError(
             "no_grad_vars must be None, Variable or list/tuple/set of Variables")
 
-    if backward_strategy is None:
-        backward_strategy = core.BackwardStrategy()
-
-    assert isinstance(backward_strategy, core.BackwardStrategy), \
-        "backward_strategy must be type paddle.fluid.dygraph.BackwardStrategy"
-
     assert isinstance(create_graph, bool), "create_graph must be True or False"
 
     if retain_graph is None:
@@ -534,9 +514,9 @@ def grad(outputs,
 
     place = core.Place()
     place.set_place(framework._current_expected_place())
-    return core.dygraph_partial_grad(
-        inputs, outputs, grad_outputs, no_grad_vars, place, backward_strategy,
-        create_graph, retain_graph, allow_unused, only_inputs)
+    return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
+                                     no_grad_vars, place, create_graph,
+                                     retain_graph, allow_unused, only_inputs)
 
 
 @framework.dygraph_only
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
index 1f91027e462d3437b0ef01455aa037cb38d8b658..9608910ee8d6223ea8e7bab06d5db90632cc2be0 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
@@ -34,6 +34,9 @@ from .convert_call_func import *
 
 from . import convert_operators
 
+from . import logging_utils
+from .logging_utils import *
+
 __all__ = []
 __all__ += ast_transformer.__all__
 __all__ += loop_transformer.__all__
@@ -41,3 +44,4 @@ __all__ += static_analysis.__all__
 __all__ += variable_trans_func.__all__
 __all__ += program_translator.__all__
 __all__ += convert_call_func.__all__
+__all__ += logging_utils.__all__
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index f859d40050c73d276bf9940d904c656debd35c82..5152799ca72f1461d6fbfc3a619a6aa9b9477934 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 # as produced by ast.parse from the standard ast module.
 # See details in https://github.com/serge-sans-paille/gast/
 import gast
-
 from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer
 from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer
 from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer
@@ -31,14 +30,16 @@ from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTr
 from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer
 from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer
 from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
 from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer
 
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name
 
 __all__ = ['DygraphToStaticAst']
 
-DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func']
+DECORATOR_NAMES = ['declarative', 'to_static', 'dygraph_to_static_func']
 
 
 class DygraphToStaticAst(gast.NodeTransformer):
@@ -57,45 +58,70 @@ class DygraphToStaticAst(gast.NodeTransformer):
         return self.static_analysis_root
 
     def transfer_from_node_type(self, node_wrapper):
+        translator_logger = logging_utils.TranslatorLogger()
+        translator_logger.log(
+            1, "   Source code: \n{}".format(ast_to_source_code(self.root)))
         # Generic transformation
         self.visit(node_wrapper.node)
 
         # Transform basic api of dygraph to static graph and get feed_name_to_arg_name
-        basic_api_trans = BasicApiTransformer(node_wrapper)
-        basic_api_trans.transform()
+        BasicApiTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(1, self.root,
+                                               "BasicApiTransformer")
 
         # Transform Tensor.shape into fluid.layers.shape(Tensor)
         TensorShapeTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(2, self.root,
+                                               "TensorShapeTransformer")
 
         # Transform list used in control flow
         ListTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(3, self.root, "ListTransformer")
 
         # Transform break/continue in loops
         BreakContinueTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(4, self.root,
+                                               "BreakContinueTransformer")
 
         # Transform return in functions
         ReturnTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(5, self.root,
+                                               "ReturnTransformer")
 
         # Transform logical and/or/not
         LogicalTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(6, self.root,
+                                               "LogicalTransformer")
 
         # Transform for loop and while loop
         LoopTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(7, self.root, "LoopTransformer")
 
         # Transform all if/else statement of Dygraph into Static Graph.
         IfElseTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(8, self.root,
+                                               "IfElseTransformer")
 
         # Transform python assert statement
         AssertTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(9, self.root,
+                                               "AssertTransformer")
 
         # Transform all python print statement
         PrintTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(10, self.root,
+                                               "PrintTransformer")
 
         # Transform call recursively
         CallTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(11, self.root, "CallTransformer")
 
         # Transform python type casting statement
         CastTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(12, self.root, "CastTransformer")
+
+        translator_logger.log_transformed_code(logging_utils.LOG_AllTransformer,
+                                               self.root, "All Transformers")
 
     def visit_FunctionDef(self, node):
         if self.decorate_func_name is None:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
index 4ba1d302576df695c5b2e867452b91b3d1d2844a..7fc72d42759b0f8029ac6adfc7b9670fbffc67d5 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
@@ -19,6 +19,8 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrappe
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_api
 
+PDB_SET = "pdb.set_trace"
+
 
 class CallTransformer(gast.NodeTransformer):
     """
@@ -62,6 +64,12 @@ class CallTransformer(gast.NodeTransformer):
             return node
 
         func_str = ast_to_source_code(node.func).strip()
+
+        # NOTE(liym27): Don't convert `pad.set_trace` even if the convertion doesn't work finally, because
+        # it is clearer to see where it is called from.
+        if PDB_SET in func_str:
+            return node
+
         new_func_str = "fluid.dygraph.dygraph_to_static.convert_call({})".format(
             func_str)
         new_func_ast = gast.parse(new_func_str).body[0].value
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
index edd7dfcf93977bb3244c0c1676715a65dba88dc2..4630cfcdabfd307ea03a7fd0c885c73ce4a4ea0b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
@@ -27,13 +27,16 @@ import types
 import numpy
 import six
 
-from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
-from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len
+from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static
+from paddle.fluid.dygraph.layers import Layer
 
-DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func']
-program_translator = ProgramTranslator()
-to_static_func = program_translator.get_func
+# TODO(liym27): A better way to do this.
+BUILTIN_LIKELY_MODULES = [collections, pdb, copy, inspect, re, six, numpy]
+
+translator_logger = TranslatorLogger()
 
 
 def is_builtin(func):
@@ -41,11 +44,6 @@ def is_builtin(func):
         return True
     elif func in six.moves.builtins.__dict__.values():
         return True
-    # Other built-in modules
-    # TODO(liym27): A better way to do this.
-    elif any(func in m.__dict__.values()
-             for m in (collections, pdb, copy, inspect, re, six, numpy)):
-        return True
     else:
         return False
 
@@ -61,9 +59,29 @@ def is_paddle_func(func):
     return m is not None and m.__name__.startswith("paddle")
 
 
+def is_unsupported(func):
+    """
+    Checks whether the func is supported by dygraph to static graph.
+    """
+
+    if any(func in m.__dict__.values() for m in BUILTIN_LIKELY_MODULES):
+        translator_logger.log(
+            2,
+            "Whitelist: {} is part of built-in module and does not have to be transformed.".
+            format(func))
+        return True
+
+    if is_paddle_func(func):
+        translator_logger.log(
+            2,
+            "Whitelist: {} is part of Paddle module and does not have to be transformed.".
+            format(func))
+        return True
+
+
 def convert_call(func):
     """
-    Converts a function call which needs to be transformed to static fucntion.
+    Converts a function call which needs to be transformed to static function.
 
     Args:
         func (callable): A callable function or method to convert.
@@ -95,13 +113,24 @@ def convert_call(func):
           #  [1. 1. 1.]]
 
     """
+    translator_logger.log(1,
+                          "Convert callable object: convert {}.".format(func))
     func_self = None
     converted_call = None
 
+    # Function in convert_call may be decorated by another `@declarative`,
+    # in this case, unwraps it into a raw method or function.
+    if isinstance(func, StaticLayer):
+        instance = func._class_instance
+        if instance is not None:
+            func = func.dygraph_function.__get__(instance)
+        else:
+            func = func.dygraph_function
+
     if is_builtin_len(func):
         return convert_len
 
-    if is_builtin(func) or is_paddle_func(func):
+    if is_builtin(func) or is_unsupported(func):
         return func
 
     if inspect.isfunction(func):
@@ -109,12 +138,36 @@ def convert_call(func):
         if func.__name__ == '<lambda>':
             return func
         try:
-            global_funcs = set([
-                fn for fn in func.__globals__.values() if inspect.isfunction(fn)
-            ])
-            if func in global_funcs:
-                converted_call = to_static_func(func)
+            # Note(Aurelius84): Because `@declarative` returns a class instance instead of
+            # a function. This will modify the value referring to itself in `__globals__`.
+
+            # For example: 
+            #
+            #      @declarative
+            #      def foo(x):
+            #          return x
+            #
+            # `foo` will be converted into a wrapper class, suppose as `StaticLayer`.
+            # And `foo.__globals__['foo']` will still return this `StaticLayer` instead of
+            # `foo` function. So `isinstance(fn, StaticLayer)` is added here. 
+            global_functions = set()
+            for fn in func.__globals__.values():
+                if inspect.isfunction(fn):
+                    global_functions.add(fn)
+                elif isinstance(fn, StaticLayer):
+                    global_functions.add(fn.dygraph_function)
+
+            if func in global_functions:
+                converted_call = convert_to_static(func)
                 func_self = getattr(func, '__self__', None)
+            else:
+                # NOTE:
+                # If func is not in __globals__, it does not need to be transformed
+                # because it has been transformed before.
+                translator_logger.warn(
+                    "{} doesn't have to be transformed to static function because it has been transformed before, it will be run as-is."
+                    .format(func))
+                converted_call = func
         except AttributeError:
             # NOTE:
             # If func is not in __globals__, it does not need to be transformed
@@ -127,7 +180,7 @@ def convert_call(func):
             converted_call = None
     elif inspect.ismethod(func):
         try:
-            converted_call = to_static_func(func)
+            converted_call = convert_to_static(func)
             func_self = getattr(func, '__self__', None)
         except (IOError, OSError):
             # NOTE: func may have been decorated.
@@ -136,7 +189,7 @@ def convert_call(func):
     elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'):
         if hasattr(func, 'forward') and isinstance(func, Layer):
             try:
-                forward_func = to_static_func(func.forward)
+                forward_func = convert_to_static(func.forward)
                 setattr(func, 'forward', forward_func)
                 func_self = func
             except Exception:
@@ -146,15 +199,21 @@ def convert_call(func):
         else:
             try:
                 call_func = func.__class__.__call__
-                converted_call = to_static_func(call_func)
+                converted_call = convert_to_static(call_func)
                 func_self = func
             except Exception:
                 # NOTE:
                 # If `func` is a class which is being initialized, for example `convert_call(Foo)()`,
                 # it doesn't need to be transformed
                 func_self = None if func_self else func_self
+    else:
+        raise NotImplementedError(
+            "Callable {} can not be transformed at present.".format(func))
 
     if converted_call is None:
+        translator_logger.warn(
+            "{} doesn't have to be transformed to static function, and it will be run as-is."
+            .format(func))
         return func
 
     if func_self:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
index 74895f08d0f09a4aaae73b868dadc4525dc1c750..5aba7ca0fdc0cfda5d79f5a66d78785df49c0baf 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import sys
 import traceback
 
@@ -38,9 +39,27 @@ def attach_error_data(error, in_runtime=False):
 
     setattr(error, ERROR_DATA, error_data)
 
+    remove_static_file()
     return error
 
 
+def remove_static_file():
+    """
+    Removes temporary files created during the transformation of dygraph to static graph.
+    """
+    del_files = set()
+    for loc in global_origin_info_map:
+        static_filepath = loc[0]
+        del_files.add(static_filepath)
+
+        filename, extension = os.path.splitext(static_filepath)
+        del_files.add(filename + ".pyc")
+
+    for filepath in del_files:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+
+
 class TraceBackFrame(OriginInfo):
     """
     Traceback frame information.
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..5540c63a85bd7f8db760f0c3e25be9eefa2aace7
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -0,0 +1,311 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import six
+import inspect
+import numpy as np
+import collections
+import paddle
+from paddle.fluid import core
+from paddle.fluid.dygraph import layers
+from paddle.fluid.layers.utils import flatten
+from paddle.fluid.layers.utils import pack_sequence_as
+from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
+from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
+from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
+
+
+class FunctionSpec(object):
+    """
+    Wrapper class for a function for class method.
+    """
+
+    def __init__(self, function, input_spec=None):
+        self._dygraph_function = function
+        if input_spec is None:
+            self._input_spec = None
+            self._flat_input_spec = None
+        else:
+            self._input_spec = self._verify_input_spec(input_spec)
+            self._flat_input_spec = flatten(self._input_spec)
+
+        # parse full argument names list.
+        self._arg_names, self._default_kwargs = parse_arg_and_kwargs(function)
+
+    def unified_args_and_kwargs(self, args, kwargs):
+        """
+        Moves kwargs with default value into arguments list to keep `args` contain the same length
+        value as function definition.
+        
+        For example: 
+        
+            Given function definition: `def foo(x, a=1, b=2)`, 
+            when calling it by `foo(23)`, the args is `[23]`, kwargs is `{a=1, b=2}`.
+            In this function, it will return args with `[23, 1, 2]`, kwargs with `{}`
+
+        Args:
+            args(tuple): tuple of input arguments value of decorated function.
+            kwargs(dict): dict of input keyword arguments value of decorated function.
+
+        Return:
+            New arguments tuple containing default kwargs value.
+        """
+        if len(self._arg_names) < len(args):
+            error_msg = "The decorated function `{}` requires {} arguments: {}, but received {} with {}.".format(
+                self._dygraph_function.__name__,
+                len(self._arg_names), self._arg_names, len(args), args)
+            if args and inspect.isclass(args[0]):
+                error_msg += "\n\tMaybe the function has more than one decorator, we don't support this for now."
+                raise NotImplementedError(error_msg)
+            else:
+                raise ValueError(error_msg)
+
+        args = list(args)
+
+        for i in six.moves.range(len(args), len(self._arg_names)):
+            arg_name = self._arg_names[i]
+            if arg_name in kwargs:
+                args.append(kwargs[arg_name])
+                del kwargs[arg_name]
+            else:
+                if arg_name not in self._default_kwargs:
+                    raise ValueError(
+                        "`{}()` requires `{}` arguments, but not found in input `args`: {} and `kwargs`: {}.".
+                        format(self._dygraph_function.__name__, arg_name, args,
+                               kwargs))
+                args.append(self._default_kwargs[arg_name])
+
+        return tuple(args), kwargs
+
+    def args_to_input_spec(self, args, kwargs):
+        """
+        Converts input arguments into InputSpec.
+        
+        1. If specific input_spec, use them to construct feed layers.
+        2. If input_spec is None, consider all Tensor and Numpy.ndarray as feed layers
+
+        Args:
+            args(tuple): tuple of input arguments value of function containing default kwargs value.
+            kwargs(dict): kwargs arguments received by **kwargs.
+
+        Return:
+            Same nest structure with args by replacing value with InputSpec.
+        """
+        input_with_spec = []
+
+        if self._input_spec is not None:
+            # Note: Because the value type and length of `kwargs` is uncertain.
+            # So we don't support to deal this case while specificing `input_spec` currently.
+            if kwargs:
+                raise ValueError(
+                    "{} got unexpected keyword arguments: {}. Cannot trace the function when `input_spec` is specificed.".
+                    format(self._dygraph_function.__name__, kwargs))
+
+            # Note: The length of `input_spec` can be greater than `args`,
+            # because `args` may contains non-tensor value merged form `kwargs`
+            # after `unified_args_and_kwargs`.
+            if len(args) < len(self._input_spec):
+                raise ValueError(
+                    "Requires len(arguments) >= len(input_spec), but received len(args):{} < len(InputSpec): {}".
+                    format(len(args), len(self._input_spec)))
+
+            # replace argument with corresponding InputSpec.
+            input_with_spec = convert_to_input_spec(args, self._input_spec)
+        else:
+            for idx, input_var in enumerate(flatten(args)):
+                if isinstance(input_var, np.ndarray):
+                    input_var = paddle.static.InputSpec.from_numpy(input_var)
+                elif isinstance(input_var, core.VarBase):
+                    input_var = paddle.static.InputSpec.from_tensor(input_var)
+
+                input_with_spec.append(input_var)
+
+            input_with_spec = pack_sequence_as(args, input_with_spec)
+
+        return input_with_spec
+
+    @switch_to_static_graph
+    def to_static_inputs_with_spec(self, input_with_spec, main_program):
+        """
+        Constructs feed layer by inputs with InputSpec information for main program.
+
+        Args:
+            input_with_spec(tuple): input arguments by replacing argument with InputSpec.
+            main_program(Program): main program for inserting feed layer.
+        """
+        flat_input_spec = flatten(input_with_spec)
+
+        inputs = []
+        block = main_program.global_block()
+        for i, var_spec in enumerate(flat_input_spec):
+            if isinstance(var_spec, paddle.static.InputSpec):
+                feed_layer = block.create_var(
+                    # TODO(Aurelius84): consider a more elegant way to name this
+                    name=var_spec.name or "feed_%s" % i,
+                    shape=var_spec.shape,
+                    dtype=var_spec.dtype,
+                    is_data=True,
+                    need_check_feed=False)
+            else:
+                feed_layer = var_spec
+            inputs.append(feed_layer)
+
+        return pack_sequence_as(input_with_spec, inputs)
+
+    def _verify_input_spec(self, input_spec):
+        """
+        Verifies the `input_spec` and its element type is valid.
+        """
+        if not isinstance(input_spec, (tuple, list)):
+            raise TypeError(
+                "The type(input_spec) should be one of (tuple, list), but received {}.".
+                format(type_name(input_spec)))
+        input_spec = tuple(input_spec)
+        for spec in flatten(input_spec):
+            if not isinstance(spec, paddle.static.InputSpec):
+                raise ValueError(
+                    "The type(elem) from input_spec should be `InputSpec`, but received {}.".
+                    format(type_name(spec)))
+
+        return input_spec
+
+    def __repr__(self):
+        return "function: {}({}), input_spec: {}".format(
+            self._dygraph_function.__name__, ','.join(self._arg_names),
+            self._input_spec)
+
+    @property
+    def dygraph_function(self):
+        return self._dygraph_function
+
+    @property
+    def args_name(self):
+        return self._arg_names
+
+    @property
+    def input_spec(self):
+        return self._input_spec
+
+    @property
+    def flat_input_spec(self):
+        return self._flat_input_spec
+
+    @property
+    def code(self):
+        return func_to_source_code(self._dygraph_function)
+
+
+def get_parameters(layer_instance, include_sublayer=True):
+    """
+    Returns parameters of decorated layers. If set `include_sublayer` True,
+    the parameters created in sub layers will be added.
+    """
+    params = collections.OrderedDict()
+    if layer_instance is not None:
+        if isinstance(layer_instance, layers.Layer):
+            if include_sublayer:
+                params = layer_instance.parameters()
+                names = [p.name for p in params]
+                params = collections.OrderedDict(zip(names, params))
+            else:
+                params = layer_instance._parameters
+        else:
+            raise TypeError(
+                "Type of `layer_instance` should be nn.Layer, but received {}".
+                format(type_name(layer_instance)))
+
+    return params
+
+
+def get_buffers(layer_instance, include_sublayer=True):
+    """
+    Returns Variable buffers of decorated layers. If set `include_sublayer` True,
+    the Variable buffers created in sub layers will be added.
+    """
+    buffers = collections.OrderedDict()
+    if layer_instance is not None:
+        if isinstance(layer_instance, layers.Layer):
+            if include_sublayer:
+                buffers = layer_instance.buffers()
+                names = [buffer.name for buffer in buffers]
+                buffers = collections.OrderedDict(zip(names, buffers))
+            else:
+                buffers = layer_instance._buffers
+        else:
+            raise TypeError(
+                "Type of `layer_instance` should be nn.Layer, but received {}".
+                format(type_name(layer_instance)))
+    return buffers
+
+
+def convert_to_input_spec(inputs, input_spec):
+    """
+    Replaces tensor in structured `inputs` by InputSpec in `input_spec`.
+    
+    Args:
+        inputs(list|dict): nested structure list or dict.
+        input_spec(list|dict): same nested structure list or dict as inputs. 
+
+    
+    Return:
+        Same structure with inputs by replacing the element with specified InputSpec.
+    """
+
+    def check_type_and_len(input, spec, check_length=False):
+        if type(input) is not type(spec):
+            raise TypeError('type(input) should be {}, but received {}.'.format(
+                type(spec), type(input)))
+        if check_length and len(input) < len(spec):
+            raise ValueError(
+                'Requires len(inputs) >= len(input_spec), but received len(inputs):{} < len(input_spec):{}'.
+                format(len(inputs), len(input_spec)))
+
+    if isinstance(input_spec, (tuple, list)):
+        input_with_spec = []
+        check_type_and_len(inputs, input_spec, True)
+
+        for i, spec in enumerate(input_spec):
+            out_spec = convert_to_input_spec(inputs[i], spec)
+            input_with_spec.append(out_spec)
+
+        # Note: If the rest inputs contain tensor or numpy.ndarray
+        # without specific InputSpec, raise warning.
+        if len(inputs) > len(input_spec):
+            for rest_input in inputs[len(input_spec):]:
+                if isinstance(rest_input, (core.VarBase, np.ndarray)):
+                    logging.warning(
+                        "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
+                        "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.".
+                        format(type_name(rest_input)))
+        input_with_spec.extend(inputs[len(input_spec):])
+
+        return input_with_spec
+    elif isinstance(input_spec, dict):
+        input_with_spec = {}
+        check_type_and_len(inputs, input_spec, True)
+        for name, input in inputs.items():
+            if name in input_spec:
+                input_with_spec[name] = convert_to_input_spec(input,
+                                                              input_spec[name])
+            else:
+                input_with_spec[name] = input
+        return input_with_spec
+    elif isinstance(input_spec, paddle.static.InputSpec):
+        return input_spec
+    else:
+        raise TypeError(
+            "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.".
+            type_name(input_spec))
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..75cb65085846d672d2488c98bf6ad625ac12e78b
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import threading
+
+import six
+from paddle.fluid import log_helper
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
+
+__all__ = ["TranslatorLogger", "set_verbosity", "set_code_level"]
+
+VERBOSITY_ENV_NAME = 'TRANSLATOR_VERBOSITY'
+CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL'
+DEFAULT_VERBOSITY = -1
+DEFAULT_CODE_LEVEL = -1
+
+
+def synchronized(func):
+    def wrapper(*args, **kwargs):
+        with threading.Lock():
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+class TranslatorLogger(object):
+    """
+    class for Logging and debugging during the tranformation from dygraph to static graph.
+    The object of this class is a singleton.
+    """
+
+    @synchronized
+    def __new__(cls, *args, **kwargs):
+        if not hasattr(cls, '_instance'):
+            cls._instance = object.__new__(cls, *args, **kwargs)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        if self._initialized:
+            return
+
+        self._initialized = True
+        self._logger = log_helper.get_logger(
+            __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s')
+        self._verbosity_level = None
+        self._transformed_code_level = None
+
+    @property
+    def logger(self):
+        return self._logger
+
+    @property
+    def verbosity_level(self):
+        if self._verbosity_level is not None:
+            return self._verbosity_level
+        else:
+            return int(os.getenv(VERBOSITY_ENV_NAME, DEFAULT_VERBOSITY))
+
+    @verbosity_level.setter
+    def verbosity_level(self, level):
+        self.check_level(level)
+        self._verbosity_level = level
+
+    @property
+    def transformed_code_level(self):
+        if self._transformed_code_level is not None:
+            return self._transformed_code_level
+        else:
+            return int(os.getenv(CODE_LEVEL_ENV_NAME, DEFAULT_CODE_LEVEL))
+
+    @transformed_code_level.setter
+    def transformed_code_level(self, level):
+        self.check_level(level)
+        self._transformed_code_level = level
+
+    def check_level(self, level):
+        if isinstance(level, (six.integer_types, type(None))):
+            rv = level
+        else:
+            raise TypeError("Level is not an integer: {}".format(level))
+        return rv
+
+    def has_code_level(self, level):
+        level = self.check_level(level)
+        return level == self.transformed_code_level
+
+    def has_verbosity(self, level):
+        level = self.check_level(level)
+        return level >= self.verbosity_level
+
+    def error(self, msg, *args, **kwargs):
+        self.logger.error(msg, *args, **kwargs)
+
+    def warn(self, msg, *args, **kwargs):
+        self.logger.warn(msg, *args, **kwargs)
+
+    def log(self, level, msg, *args, **kwargs):
+        if self.has_verbosity(level):
+            self.logger.log(level, msg, *args, **kwargs)
+
+    def log_transformed_code(self, level, ast_node, transformer_name, *args,
+                             **kwargs):
+        if self.has_code_level(level):
+            source_code = ast_to_source_code(ast_node)
+            header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+                .format(level, transformer_name)
+
+            msg = header_msg + source_code
+            self.logger.info(msg, *args, **kwargs)
+
+
+_TRANSLATOR_LOGGER = TranslatorLogger()
+
+
+def set_verbosity(level=0):
+    """
+    Sets the verbosity level of log for dygraph to static graph.
+    There are two means to set the logging verbosity:
+     1. Call function `set_verbosity`
+     2. Set environment variable `TRANSLATOR_VERBOSITY`
+
+    **Note**:
+    `set_verbosity` has a higher priority than the environment variable.
+
+    Args:
+        level(int): The verbosity level. The larger value idicates more verbosity.
+            The default value is 0, which means no logging.
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            paddle.jit.set_verbosity(1)
+            # The verbosity level is now 1
+
+            os.environ['TRANSLATOR_VERBOSITY'] = '3'
+            # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity`
+    """
+    _TRANSLATOR_LOGGER.verbosity_level = level
+
+
+def get_verbosity():
+    return _TRANSLATOR_LOGGER.verbosity_level
+
+
+LOG_AllTransformer = 100
+
+
+def set_code_level(level=LOG_AllTransformer):
+    """
+    Sets the level to print code from specific level of Ast Transformer.
+    There are two means to set the code level:
+     1. Call function `set_code_level`
+     2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
+    **Note**:
+    `set_code_level` has a higher priority than the environment variable.
+
+    Args:
+        level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.jit.set_code_level(2)
+            # It will print the transformed code at level 2, which means to print the code after second transformer,
+            # as the date of August 28, 2020, it is CastTransformer.
+
+            os.environ['TRANSLATOR_CODE_LEVEL'] = '3'
+            # The code level is now 3, but it has no effect because it has a lower priority than `set_code_level`
+
+    """
+    _TRANSLATOR_LOGGER.transformed_code_level = level
+
+
+def get_code_level():
+    return _TRANSLATOR_LOGGER.transformed_code_level
+
+
+def error(msg, *args, **kwargs):
+    _TRANSLATOR_LOGGER.error(msg, *args, **kwargs)
+
+
+def warn(msg, *args, **kwargs):
+    _TRANSLATOR_LOGGER.warn(msg, *args, **kwargs)
+
+
+def log(level, msg, *args, **kwargs):
+    _TRANSLATOR_LOGGER.log(level, msg, *args, **kwargs)
+
+
+def log_transformed_code(level, ast_node, transformer_name, *args, **kwargs):
+    _TRANSLATOR_LOGGER.log_transformed_code(level, ast_node, transformer_name,
+                                            *args, **kwargs)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
index aeece9513b57710b767322c2a7986eec087b4f8d..13f38b0726c27566ff0eda41d6c365e6a7e4aa4b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -18,8 +18,8 @@ import collections
 import inspect
 
 import gast
-
 from paddle.fluid import core
+from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap
 from paddle.fluid.framework import Program
 
 # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
@@ -197,18 +197,6 @@ def attach_origin_info(ast_node, func):
     return ast_node
 
 
-# NOTE: inspect.unwrap() exits in PY3 but not in PY2.
-def unwrap(func):
-    def _is_wrapped(f):
-        return hasattr(f, '__wrapped__')
-
-    unwrapped_f = func
-    while (_is_wrapped(unwrapped_f)):
-        unwrapped_f = unwrapped_f.__wrapped__
-
-    return unwrapped_f
-
-
 def ast_walk(transformed_node, static_node):
     """
     Recursively yield all descendant nodes in the trees starting at transformed_node and static_node (including itself) in parallel.
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 88562dd40a63b3da50b34bd1cb5c1094aef1ae42..698d989343a23015529a3b37b285640466d1c30d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -14,34 +14,44 @@
 
 from __future__ import print_function
 import gast
+import collections
+import logging
 import inspect
-import warnings
+import six
 import textwrap
 import threading
-import collections
-import numpy as np
-from paddle.fluid import core, scope_guard
+import warnings
+
+import gast
 from paddle.fluid import framework
-from paddle.fluid import executor
-from paddle.fluid import unique_name
 from paddle.fluid.dygraph import layers
+from paddle.fluid.data_feeder import check_type
 from paddle.fluid.layers.utils import flatten
-from paddle.fluid.layers.utils import pack_sequence_as
+from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.dygraph.base import switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst
+from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
+from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
+from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
+from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
-from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func
+from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
+from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap
+from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import get_buffers, get_parameters
 from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
-from paddle.fluid.dygraph.base import param_guard
-from paddle.fluid.data_feeder import check_type
-from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
-from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map
-from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
-from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA
 
 __all__ = ['ProgramTranslator', 'convert_to_static']
 
+# For each traced function, we set `max_traced_program_count` = 10 to consider caching performance.
+# Once exceeding the threshold, we will raise warning to users to make sure the conversion is as expected.
+MAX_TRACED_PROGRAM_COUNT = 10
+
 
 class FunctionCache(object):
     """
@@ -89,7 +99,7 @@ class FunctionCache(object):
         """
         # Note: In Python2, it will raise OSError when inspect function
         # with decorator directly and function.__wrapped__ holds the actual function.
-        func = getattr(func, '__wrapped__', func)
+        func = unwrap(func)
         source_code = func_to_source_code(func)
 
         # TODO(liym27):
@@ -130,100 +140,323 @@ def convert_to_static(function):
         return static_func
 
 
-class FunctionSpec(object):
-    def __init__(self, func, args, kwargs):
-        self._dyfunc = func
-        self._args = args
-        self._kwargs = kwargs
+class CacheKey(object):
+    """
+    Cached key for ProgramCache.
+    """
 
-        # TODO(liym27): func has multi layer decorator
-        dyfunc = getattr(func, '__wrapped__', func)
-        self._dyfunc_code = inspect.getsource(dyfunc)
+    __slots__ = ['function_spec', 'input_with_spec', 'class_instance']
 
-    def is_method(self):
-        return self._args and isinstance(self._args[0], layers.Layer)
+    def __init__(self, function_spec, input_with_spec, class_instance):
+        """
+        Initializes a cache key.
 
-    def parameters(self, include_sublayer=True):
+        Args:
+            functions_spec(FunctionSpec): a FunctionSpec instance of decorated function.
+            input_with_spec(list[InputSpec]): actual inputs with some arguments replaced by InputSpec.
+            class_instance(object): a instance of class `Layer`.
         """
-        Returns parameters of decorated layers. If set `include_sublayer` True,
-        the parameters created in sub layers will be added.
+        self.function_spec = function_spec
+        self.input_with_spec = input_with_spec
+        self.class_instance = class_instance
+
+    @classmethod
+    def from_func_and_args(cls, function_spec, args, kwargs, class_instance):
+        """
+        Generated a CacheKey instance by given inputs.
+
+        Args:
+            functions_spec(FunctionSpec): a FunctionSpec instance of decorated function.
+            args(tuple): tuple of actual inputs arguments.
+            kwargs(dict): dict of actual inputs keyword arguments.
+            class_instance(object): a instance of class `Layer`.
         """
-        params = collections.OrderedDict()
-        if self.is_method():
-            layer_instance = self._args[0]
-            if include_sublayer:
-                params = layer_instance.parameters()
-                names = [p.name for p in params]
-                params = collections.OrderedDict(zip(names, params))
+        # 1. filter `self` in args
+        if args and isinstance(args[0], layers.Layer):
+            args = args[1:]
+        # 2. convert tensor and numpy array into InputSpec 
+        _args, _kwargs = function_spec.unified_args_and_kwargs(args, kwargs)
+        input_with_spec = function_spec.args_to_input_spec(_args, _kwargs)
+
+        # 3. check whether hit the cache or build a new program for the input arguments
+        return CacheKey(function_spec, input_with_spec, class_instance)
+
+    def __hash__(self):
+        error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)."
+        return hash((id(self.function_spec),
+                     make_hashable(self.input_with_spec, error_msg),
+                     self.class_instance))
+
+    def __eq__(self, other):
+        return (type(self) is type(other)) and hash(self) == hash(other)
+
+    def __neq__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return "id(function_spec): {}, input_with_spec: {}, class_instance: {}".format(
+            id(self.function_spec), self.input_with_spec, self.class_instance)
+
+
+def unwrap_decorators(func):
+    """
+    Unwraps a decorated function and returns the decorator list and inner target.
+    """
+    decorators = []
+    cur = func
+    while True:
+        if isinstance(cur, StaticLayer):
+            decorators.append(cur)
+            # Note: if `cur` is a method, keep it as bound method of class.
+            instance = cur._class_instance
+            if instance is not None:
+                cur = cur.dygraph_function.__get__(instance)
             else:
-                params = layer_instance._parameters
-        return params
+                cur = cur.dygraph_function
+        else:
+            break
+    return decorators, cur
+
+
+class StaticLayer(object):
+    """
+    Wrapper class to Manage program conversion of decorated function.
 
-    def buffers(self, include_sublayer=True):
+    """
+
+    def __init__(self, function, input_spec=None):
         """
-        Returns Variable buffers of decorated layers. If set `include_sublayer` True,
-        the Variable buffers created in sub layers will be added.
+        Initializes a `StaticLayer`.
+
+        Args:
+            function(callable): A function or method that will be converted into static program.
+            input_spec(list[InputSpec]): list of InputSpec to specify the `shape/dtype/name` information for each input argument, default None.
         """
-        buffers = collections.OrderedDict()
-        if self.is_method():
-            layer_instance = self._args[0]
-            if include_sublayer:
-                buffers = layer_instance.buffers()
-                names = [buffer.name for buffer in buffers]
-                buffers = collections.OrderedDict(zip(names, buffers))
+        # save the instance `self` while decorating a method of class.
+        if inspect.ismethod(function):
+            self._dygraph_function = getattr(function, '__func__')
+            self._class_instance = getattr(function, '__self__')
+        else:
+            self._dygraph_function = function
+            self._class_instance = None
+
+        self._input_spec = input_spec
+        self._function_spec = FunctionSpec(function, input_spec)
+        self._program_cache = ProgramCache()
+        # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`.
+        self._program_trans = ProgramTranslator()
+
+    def __get__(self, instance, owner):
+        """
+        Overrides this method to parse the class instance and call bound method correctly.
+
+        For example:
+            
+            '''
+            class Net(Layer):
+                def __init__(self):
+                    pass
+                
+                @paddle.jit.to_static
+                def forward(self, x, y):
+                    return x + y
+
+            net = Net()
+            out = net(x, y)
+            '''
+        
+        In above case, `net(x, y)` will call `net.forward(x, y)` firstly that is a bound method
+        of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__`
+        to parse the class instance correctly instead of the `StaticLayer` instance.
+        """
+        self._class_instance = instance
+        return self
+
+    def __call__(self, *args, **kwargs):
+        """
+        Supports to call the returned instance with input `args` and `kwargs` directly.
+
+        Args:
+            *args(tuple): tuple of all input arguments from original decorated function.
+            **kwargs(dict): dict of all input keyward arguments from original decorated function. 
+
+        Return:
+            Outputs of decorated function.
+        """
+        # 1. call dygraph function directly if not enable `declarative`
+        if not self._program_trans.enable_declarative:
+            warnings.warn(
+                "The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. "
+                "We will just return dygraph output.")
+            return self._call_dygraph_function(*args, **kwargs)
+
+        # 2. trace ops from dygraph layers and cache the generated program.
+        args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
+        try:
+            concrete_program, partial_program_layer = self.get_concrete_program(
+                *args, **kwargs)
+
+            # 3. synchronize self.training attribute.
+            if isinstance(self._class_instance, layers.Layer):
+                partial_program_layer.training = self._class_instance.training
+
+            # 4. return outputs.
+            return partial_program_layer(args)
+        except Exception as e:
+            if not hasattr(e, ERROR_DATA):
+                # runtime error
+                attach_error_data(e, in_runtime=True)
+            error_data = getattr(e, ERROR_DATA, None)
+            if error_data:
+                new_exception = error_data.create_exception()
+                if six.PY3:
+                    # NOTE(liym27):
+                    # 1. Why `raise new_exception from None`?
+                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
+                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
+                    #   caught exception.
+                    # 2. Use exec to bypass syntax error checking in Python 2.
+
+                    six.exec_("raise new_exception from None")
+                else:
+                    raise new_exception
             else:
-                buffers = layer_instance._buffers
-        return buffers
+                raise
 
-    @switch_to_static_graph
-    def to_static_inputs(self, main_program):
-        inputs = []
-        block = main_program.global_block()
-        for input_var in flatten(self.args):
-            if isinstance(input_var, np.ndarray):
-                feed_layer = block.create_var(
-                    name=unique_name.generate('feed'),
-                    shape=list(input_var.shape),
-                    dtype=input_var.dtype,
-                    is_data=True,
-                    need_check_feed=False)
-            elif isinstance(input_var, core.VarBase):
-                feed_layer = block.create_var(
-                    name=input_var.name,
-                    shape=list(input_var.shape),
-                    dtype=input_var.dtype,
-                    stop_gradient=input_var.stop_gradient,
-                    need_check_feed=False)
+    def _call_dygraph_function(self, *args, **kwargs):
+        """
+        Calls dygraph function directly and returns the outputs.
+
+        Args:
+            *args(tuple): tuple of all input arguments from original decorated function.
+            **kwargs(dict): dict of all input keyward arguments from original decorated function. 
+
+        Return:
+            Outputs of dygraph function.
+        """
+        if self._class_instance is not None:
+            dygraph_function = self._dygraph_function.__get__(
+                self._class_instance)
+        else:
+            dygraph_function = self._dygraph_function
+
+        return dygraph_function(*args, **kwargs)
+
+    def get_concrete_program(self, *args, **kwargs):
+        """
+        Returns traced concrete program and inner executable partial layer.
+
+        Args:
+            *args(tuple): input arguments values or InputSpec
+            **kwargs(dict) : input kwargs values.
+
+        Returns:
+            Traced ConcreteProgram and executable translated Layer.
+        """
+        # 1. unify args/kwargs and replace Tensor with InputSpec
+        if len(args) != len(self._function_spec.args_name):
+            args, kwargs = self._function_spec.unified_args_and_kwargs(args,
+                                                                       kwargs)
+        input_with_spec = self._function_spec.args_to_input_spec(args, kwargs)
+
+        # 2. generate cache key
+        cache_key = CacheKey(self._function_spec, input_with_spec,
+                             self._class_instance)
+
+        # 3. check whether hit the cache or build a new program for the input arguments
+        concrete_program, partial_program_layer = self._program_cache[cache_key]
+        return concrete_program, partial_program_layer
+
+    def get_traced_count(self):
+        """
+        Returns the number of traced programs for the decorated function.
+        """
+        return len(self._program_cache)
+
+    @property
+    def code(self):
+        """
+        Returns the source code of transformed static function for debugging.
+        """
+        static_func = convert_to_static(self._dygraph_function)
+        source_code = func_to_source_code(static_func)
+        return source_code
+
+    @property
+    def dygraph_function(self):
+        """
+        Returns the original decorated function.
+        """
+        return self._dygraph_function
+
+    @property
+    def concrete_program(self):
+        """
+        Returns recent ConcreteProgram instance of decorated function.
+        """
+        # if specific the `input_spec`, the length of program_cache will always 1,
+        # else, return the last one.
+        cached_program_len = len(self._program_cache)
+        # If specific `input_spec`, apply convertion from dygraph layers into static Program.
+        if cached_program_len == 0:
+            if len(self._function_spec.flat_input_spec) > 0:
+                input_spec = self._function_spec.input_spec
+                concrete_program, _ = self.get_concrete_program(*input_spec)
+                return concrete_program
             else:
-                feed_layer = input_var
+                raise ValueError("No valid transformed program for {}".format(
+                    self._function_spec))
+        # If more than one programs have been cached, return the recent converted program by default.
+        elif cached_program_len > 1:
+            logging.warning(
+                "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
+                format(self._function_spec, cached_program_len))
+
+        cache_key, (concrete_program,
+                    partial_layer) = self._program_cache.last()
+        return concrete_program
 
-            inputs.append(feed_layer)
-        # Restores the nested structure as self.args
-        return pack_sequence_as(self.args, inputs)
+    @property
+    def inputs(self):
+        """
+        Returns input tensors of recent converted static program.
+        """
+        concrete_program = self.concrete_program
+        inputs = [
+            var for var in flatten(concrete_program.inputs)
+            if isinstance(var, framework.Variable)
+        ]
+        return inputs
 
     @property
-    def dyfunc(self):
-        return self._dyfunc
+    def outputs(self):
+        """
+        Returns output tensors of recent converted static program.
+        """
+        concrete_program = self.concrete_program
+        outputs = [
+            var for var in flatten(concrete_program.outputs)
+            if isinstance(var, framework.Variable)
+        ]
+
+        return outputs
 
     @property
-    def args(self):
-        return self._args
-
-    def __key(self):
-        # Note: if dygraph function is a method of class,
-        # consider instance info as hash key.
-        if self.is_method():
-            # NOTE: we can use Layer's (instance + function code) as hash key.
-            # An instance will not hold two identical methods 
-            return self._dyfunc_code, self._args[0]
-        else:
-            return self._dyfunc
+    def main_program(self):
+        """
+        Returns recent converted static main program.
+        """
+        concrete_program = self.concrete_program
+        main_program = concrete_program.main_program
+        return main_program
 
-    def __hash__(self):
-        return hash(self.__key())
+    @property
+    def program_cache(self):
+        return self._program_cache
 
-    def __eq__(self, other):
-        return self.__key() == self.__key()
+    @property
+    def function_spec(self):
+        return self._function_spec
 
 
 # Flag that indicates whether running code under `@declarative`
@@ -249,11 +482,17 @@ def _switch_declarative_mode_guard_(is_declarative=True):
 
 
 class ConcreteProgram(object):
+
+    __slots__ = [
+        'inputs', 'outputs', 'main_program', "startup_program", "parameters",
+        "function"
+    ]
+
     def __init__(self,
                  inputs,
                  outputs,
                  parameters,
-                 func,
+                 function,
                  main_program,
                  startup_program=None):
         self.inputs = inputs
@@ -261,17 +500,21 @@ class ConcreteProgram(object):
         self.main_program = main_program
         self.startup_program = startup_program
         self.parameters = parameters
-        self.func_spec = func
+        self.function = function
 
     @staticmethod
     @switch_to_static_graph
-    def from_func_spec(func_spec):
+    def from_func_spec(func_spec, input_spec, class_instance):
         """
         Builds the main_program with specialized inputs and returns outputs
         of program as fetch_list.
+
+        Args:
+            func_spec(FunctionSpec): A FunctionSpec instance for decorated function.
+            input_spec(list[InputSpec]): 
         """
         # Transforms dygraph function into static function and caches it.
-        dygraph_function = func_spec.dyfunc
+        dygraph_function = func_spec.dygraph_function
         static_func = convert_to_static(dygraph_function)
 
         main_program, startup_program = framework.Program(), framework.Program()
@@ -285,15 +528,20 @@ class ConcreteProgram(object):
         with framework.program_guard(main_program, startup_program):
             with _switch_declarative_mode_guard_(is_declarative=True):
                 # 1. Adds `fluid.data` layers for input if needed
-                inputs = func_spec.to_static_inputs(main_program)
+                inputs = func_spec.to_static_inputs_with_spec(input_spec,
+                                                              main_program)
+                if class_instance:
+                    inputs = tuple([class_instance] + list(inputs))
 
                 # 2. Gets all ParamBases and buffered VarBases in the function
-                all_parameters_and_buffers = list(func_spec.parameters().values(
-                )) + list(func_spec.buffers().values())
+                all_parameters_and_buffers = list(
+                    get_parameters(class_instance).values()) + list(
+                        get_buffers(class_instance).values())
 
                 # 3. Builds program only once and returns the output Variables.
-                with param_guard(func_spec.parameters(False)), param_guard(
-                        func_spec.buffers(False)):
+                with param_guard(get_parameters(
+                        class_instance, False)), param_guard(
+                            get_buffers(class_instance, False)):
                     try:
                         outputs = static_func(*inputs)
                     except BaseException as e:
@@ -311,7 +559,7 @@ class ConcreteProgram(object):
             inputs=inputs,
             outputs=outputs,
             parameters=all_parameters_and_buffers,
-            func=dygraph_function,
+            function=dygraph_function,
             main_program=main_program,
             startup_program=startup_program)
 
@@ -324,27 +572,38 @@ class ProgramCache(object):
     def __init__(self):
         self._caches = collections.OrderedDict()
 
-    def _build_once(self, func_spec):
-        concrete_program = ConcreteProgram.from_func_spec(func_spec)
+    def _build_once(self, cache_key):
+        concrete_program = ConcreteProgram.from_func_spec(
+            func_spec=cache_key.function_spec,
+            input_spec=cache_key.input_with_spec,
+            class_instance=cache_key.class_instance)
         return concrete_program, partial_program_from(concrete_program)
 
     def __getitem__(self, item):
-        if not isinstance(item, FunctionSpec):
-            raise ValueError(
-                'type(item) should be FunctionSpec, but received %s' %
-                type(item))
+        if not isinstance(item, CacheKey):
+            raise ValueError('type(item) should be CacheKey, but received %s' %
+                             type_name(item))
+
         if item not in self._caches:
             self._caches[item] = self._build_once(item)
+            # Note: raise warnings if number of traced program is more than `max_tracing_count`
+            current_tracing_count = len(self._caches)
+            if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
+                logging.warning(
+                    "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
+                    "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
+                    format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
+
         return self._caches[item]
 
     def get_program(self, item):
-        if not isinstance(item, FunctionSpec):
+        if not isinstance(item, CacheKey):
             raise ValueError(
                 "Input item's type should be FunctionSpec, but received %s" %
-                type(item))
+                type_name(item))
         if item not in self._caches:
             raise RuntimeError(
-                "Failed to find program for input item, please decorate input function by `@declarative`."
+                "Failed to find program for input item, please decorate input function by `@paddle.jit.to_static`."
             )
         return self._caches[item]
 
@@ -354,6 +613,12 @@ class ProgramCache(object):
         key = next(reversed(self._caches.keys()))
         return key, self._caches[key]
 
+    def __len__(self):
+        return len(self._caches)
+
+    def concrete_programs(self):
+        return [cp for key, (cp, _) in self._caches.iteritems()]
+
 
 def synchronized(func):
     func.__lock__ = threading.Lock()
@@ -502,9 +767,11 @@ class ProgramTranslator(object):
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
 
-        function_spec = FunctionSpec(dygraph_func, args, kwargs)
-        concrete_program, partial_program_layer = self._program_cache[
-            function_spec]
+        function_spec = FunctionSpec(dygraph_func)
+        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
+                                                getattr(dygraph_func,
+                                                        '__self__', None))
+        _, partial_program_layer = self._program_cache[cache_key]
 
         if args and isinstance(args[0], layers.Layer):
             # Synchronize self.training attribute.
@@ -618,8 +885,12 @@ class ProgramTranslator(object):
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
 
-        func_spec = FunctionSpec(dygraph_func, args, kwargs)
-        concrete_program, _ = self._program_cache[func_spec]
+        function_spec = FunctionSpec(dygraph_func)
+        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
+                                                getattr(dygraph_func,
+                                                        '__self__', None))
+        concrete_program, partial_program_layer = self._program_cache[cache_key]
+
         # Note: concrete_program hold all input/output infos include non-Variable
         input_vars = [
             var for var in concrete_program.inputs
@@ -669,7 +940,9 @@ class ProgramTranslator(object):
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_code"
         # Gets AST from dygraph function
-        raw_code = inspect.getsource(dygraph_func)
+
+        unwrap_func = unwrap(dygraph_func)
+        raw_code = inspect.getsource(unwrap_func)
         code = textwrap.dedent(raw_code)
         root = gast.parse(code)
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index bb5b2843c92e2b1ed88b002bb1511c07ddd61f37..ba02a983f8e641079d8a60b166a6f098e6f725a8 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -18,16 +18,23 @@ import ast
 import astor
 import atexit
 import copy
+import collections
 import gast
-import imp
 import inspect
 import os
 import six
 import tempfile
 import textwrap
+import numpy as np
 
 from paddle.fluid import unique_name
 
+# imp is deprecated in python3
+if six.PY2:
+    import imp
+else:
+    from importlib.machinery import SourceFileLoader
+
 dygraph_class_to_static_api = {
     "CosineDecay": "cosine_decay",
     "ExponentialDecay": "exponential_decay",
@@ -41,6 +48,77 @@ dygraph_class_to_static_api = {
 FOR_ITER_INDEX_PREFIX = '__for_loop_var_index'
 FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len'
 
+# FullArgSpec is valid from Python3. Defined a Namedtuple to
+# to make it available in Python2.
+FullArgSpec = collections.namedtuple('FullArgSpec', [
+    'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults',
+    'annotations'
+])
+
+
+def getfullargspec(target):
+    if hasattr(inspect, "getfullargspec"):
+        return inspect.getfullargspec(target)
+    else:
+        argspec = inspect.getargspec(target)
+        return FullArgSpec(
+            args=argspec.args,
+            varargs=argspec.varargs,
+            varkw=argspec.keywords,
+            defaults=argspec.defaults,
+            kwonlyargs=[],
+            kwonlydefaults=None,
+            annotations={})
+
+
+def parse_arg_and_kwargs(function):
+    """
+    Returns full argument names as list. e.g ['x', 'y', 'z']
+    """
+    fullargspec = getfullargspec(function)
+    arg_names = fullargspec.args
+    if arg_names and 'self' == arg_names[0]:
+        arg_names = fullargspec.args[1:]
+
+    # parse default kwargs
+    default_kwargs = {}
+    default_values = fullargspec.defaults
+    if default_values:
+        assert len(default_values) <= len(arg_names)
+        default_kwarg_names = arg_names[-len(default_values):]
+        default_kwargs = dict(zip(default_kwarg_names, default_values))
+
+    return arg_names, default_kwargs
+
+
+def type_name(v):
+    return type(v).__name__
+
+
+def make_hashable(x, error_msg=None):
+    """
+    Makes input `x` hashable.
+
+    For some unhashable objects, such as `dict/list/np.ndarray`,applying hash function by using their values.
+    """
+    if isinstance(x, (tuple, list)):
+        return tuple(map(make_hashable, x))
+
+    try:
+        hash(x)
+    except TypeError:
+        if isinstance(x, np.ndarray):
+            # Note: `tostring()` will return the binary data from np.ndarray that
+            # means different value will lead to different hash code.
+            return hash(x.tostring())
+        elif isinstance(x, dict):
+            return tuple(map(make_hashable, x.values()))
+
+        error_msg = error_msg or "Requires a hashable object."
+        raise ValueError(error_msg + " But received type: %s" % type_name(x))
+
+    return x
+
 
 def _is_api_in_module_helper(obj, module_prefix):
     m = inspect.getmodule(obj)
@@ -368,9 +446,15 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True):
     TODO: If only decorate one of inner function instead of decorating the main
     function, the other inner functions are invisible for the decorated function.
     """
+
+    def remove_if_exit(filepath):
+        if os.path.exists(filepath):
+            os.remove(filepath)
+
     source = ast_to_source_code(ast_root)
     import_fluid = "import paddle.fluid as fluid\n"
     source = import_fluid + source
+
     if six.PY2:
         source = source.encode('utf-8')
         f = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False)
@@ -382,8 +466,13 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True):
         f.write(source)
 
     if delete_on_exit:
-        atexit.register(lambda: os.remove(f.name))
-    module = imp.load_source(module_name, f.name)
+        atexit.register(lambda: remove_if_exit(f.name))
+        atexit.register(lambda: remove_if_exit(f.name[:-3] + ".pyc"))
+
+    if six.PY2:
+        module = imp.load_source(module_name, f.name)
+    else:
+        module = SourceFileLoader(module_name, f.name).load_module()
     func_name = dyfunc.__name__
     if not hasattr(module, func_name):
         raise ValueError(
@@ -1045,3 +1134,19 @@ class SplitAssignTransformer(gast.NodeTransformer):
             value_node = target
 
         return new_nodes
+
+
+# NOTE: inspect.unwrap() exits in PY3 but not in PY2.
+def unwrap(func):
+    """
+    Returns the object wrapped by decorators.
+    """
+
+    def _is_wrapped(f):
+        return hasattr(f, '__wrapped__')
+
+    unwrapped_f = func
+    while (_is_wrapped(unwrapped_f)):
+        unwrapped_f = unwrapped_f.__wrapped__
+
+    return unwrapped_f
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 7396289392affa92e69e9f55fba622fd13fa979f..7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -23,6 +23,7 @@ from paddle import compat as cpt
 from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid import backward
+from paddle.fluid import unique_name
 from paddle.fluid.dygraph import layers
 from paddle.fluid.layers import nn
 from paddle.fluid.dygraph.base import switch_to_static_graph
@@ -31,6 +32,9 @@ __all__ = ['TranslatedLayer']
 
 VARIABLE_FILENAME = "__variables__"
 EXTRA_VAR_INFO_FILENAME = "__variables.info__"
+LOADED_VAR_SUFFIX = "load"
+PARAMETER_NAME_PREFIX = "param"
+BUFFER_NAME_PREFIX = "buffer"
 
 
 def _load_program_desc(model_file_path):
@@ -107,33 +111,30 @@ def _get_all_var_names(program_desc):
     return all_var_names
 
 
+@switch_to_static_graph
 def _append_loaded_suffix(name):
     """
     Append loaded suffix to the given variable name
-    e.g. x ==> x@LOADED
+    e.g. x ==> x.load_0, x.load_0 ==> x.load_0.load_0
     """
-    suffix = core.loaded_var_suffix()
+    suffix = LOADED_VAR_SUFFIX
     name = cpt.to_text(name)
-    if suffix not in name:
-        name = name + suffix
-    return name
+    new_name = unique_name.generate_with_ignorable_key('.'.join((name, suffix)))
+    return new_name
 
 
-def _remove_loaded_suffix(name):
-    """
-    Remove loaded suffix to the given variable name
-    e.g. x@LOADED ==> x
-    """
-    suffix = core.loaded_var_suffix()
-    name = cpt.to_text(name)
-    return name.replace(suffix, '')
+@switch_to_static_graph
+def _generate_unique_var_name(prefix):
+    return unique_name.generate_with_ignorable_key(prefix)
 
 
 def _append_loaded_suffix_to_var(program_desc):
+    suffix_varname_dict = dict()
     persistable_vars = _get_persistable_vars(program_desc)
     for var_desc in persistable_vars:
         old_name = var_desc.name()
         new_name = _append_loaded_suffix(var_desc.name())
+        suffix_varname_dict[new_name] = old_name
         var_desc.set_name(new_name)
         for block_idx in six.moves.range(program_desc.num_blocks()):
             block = program_desc.block(block_idx)
@@ -141,6 +142,7 @@ def _append_loaded_suffix_to_var(program_desc):
                 op = block.op(op_idx)
                 op._rename_input(old_name, new_name)
                 op._rename_output(old_name, new_name)
+    return suffix_varname_dict
 
 
 @switch_to_static_graph
@@ -187,6 +189,9 @@ class _ProgramHolder(object):
         # execution scope
         self._inner_scope = core.Scope()
 
+        # append suffix var name dict
+        self._suffix_varname_dict = None
+
         # forward program
         self._infer_program_desc = self._preprocess(program_desc)
         # forward + backward program
@@ -272,7 +277,7 @@ class _ProgramHolder(object):
         self._append_scale_to_output(tmp_program)
 
         # 4. Persistable vars processing
-        # - append @LOADED suffix to persistable vars
+        # - append loaded suffix to persistable vars
         # NOTE: [why need to append suffix to persistable vars]
         # Dygraph and static graph mode use the same naming mechanism. 
         # If users want to load the model fine-tune, it is possible 
@@ -281,10 +286,7 @@ class _ProgramHolder(object):
         # and later after loading, a new linear is added. At this time, 
         # there will be a problem of duplicate names, so here is unified 
         # to add the LOADED suffix to the parameters of the model loaded
-        # during training. And in order to avoid multiple @LOADED suffix
-        # are appended to variable name, we only append @LOADED suffix to
-        # the variable that not contains @LOADED suffix.
-        _append_loaded_suffix_to_var(program_desc)
+        self._suffix_varname_dict = _append_loaded_suffix_to_var(program_desc)
         # - get persistable var
         self._persistable_names = _get_persistable_var_names(program_desc)
 
@@ -298,7 +300,7 @@ class _ProgramHolder(object):
             for i, out in enumerate(self._output_descs):
                 var = program.global_block().var(out.name())
                 var = nn.scale(
-                    var, 1., name="static_model_runner/scale_{}".format(i))
+                    var, 1., name="translated_layer/scale_{}".format(i))
                 scale_output_vars.append(var)
         # 2. update output names & descs
         for i, var in enumerate(scale_output_vars):
@@ -363,7 +365,7 @@ def _load_persistable_vars_by_program(model_path,
     persistable_vars = _get_persistable_vars(program_holder.infer_program)
     load_var_dict = {}
     for each_var in persistable_vars:
-        orig_each_name = _remove_loaded_suffix(each_var.name())
+        orig_each_name = program_holder._suffix_varname_dict[each_var.name()]
         if _is_parameter(each_var, program_holder.infer_program):
             # create output varbase
             new_var = framework.ParamBase(
@@ -376,7 +378,7 @@ def _load_persistable_vars_by_program(model_path,
             new_var = framework._varbase_creator(
                 type=each_var.type(),
                 name=each_var.name(),
-                shpae=each_var.shape(),
+                shape=each_var.shape(),
                 dtype=each_var.dtype(),
                 persistable=True)
         if params_filename is None:
@@ -421,6 +423,7 @@ def _load_persistable_vars_by_program(model_path,
 
 def _load_persistable_vars(model_path,
                            var_info_path,
+                           program_holder,
                            separate_params=False,
                            params_filename=None):
     # 1. load extra var info
@@ -430,10 +433,22 @@ def _load_persistable_vars(model_path,
     # 2. construct var dict
     load_var_dict = dict()
     load_var_list = []
-    # NOTE: some var may not be Parameter
-    for name in sorted(extra_var_info):
-        # append suffix, see [why need to append suffix to persistable vars]
-        new_name = _append_loaded_suffix(name)
+    inv_suffix_varname_dict = {
+        value: key
+        for key, value in program_holder._suffix_varname_dict.items()
+    }
+
+    # NOTE(chenweihang): we need load persistable vars based the program,
+    # because the program may be pruned when `save_inference_model`, some
+    # var in `extra_var_info` may have been pruned 
+    for name in sorted(inv_suffix_varname_dict):
+        if name not in extra_var_info:
+            raise RuntimeError(
+                "The model to be loaded is not complete."
+                "The variable `%s` of program cannot be found in loaded model.",
+                name)
+        # get suffix var name, see [why need to append suffix to persistable vars]
+        new_name = inv_suffix_varname_dict[name]
         # create output varbase
         if extra_var_info[name].get('trainable', None) is not None:
             # use default shape and dtype
@@ -506,7 +521,8 @@ def _construct_params_and_buffers(model_path,
     var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
     if os.path.exists(var_info_path):
         var_dict = _load_persistable_vars(model_path, var_info_path,
-                                          separate_params, params_filename)
+                                          programs['forward'], separate_params,
+                                          params_filename)
     else:
         var_dict = _load_persistable_vars_by_program(
             model_path, programs['forward'], params_filename)
@@ -620,20 +636,34 @@ class TranslatedLayer(layers.Layer):
             )
         if not isinstance(persistable_vars, dict):
             raise TypeError(
-                "TranslatedLayer need to use persisatbale variable dict for initialization."
+                "TranslatedLayer need to use persistable variable dict for initialization."
             )
 
         self._program_holder_dict = programs
 
-        for name, var in persistable_vars.items():
-            if isinstance(var, framework.ParamBase):
-                self.add_parameter(name, var)
-            elif isinstance(var, core.VarBase):
-                self.register_buffer(name, var)
-            else:
-                raise TypeError(
-                    "Adding persistent variable which  to layer is not supported now"
-                )
+        # NOTE(chenweihang): [ why not use var name directly? ]
+        # When add parameter or buffer to Layer by follow apis,
+        # the variable name can't contain `.`, beccause which may cause
+        # AttributeError when access the newly added parameter or buffer
+        # in the form of `self.**.**``, but the ParamBase or BarBase
+        # name contains `.` originally, such as `linear_0.w_0`, so here
+        # need to generate new var name for each var
+        self._persistable_var_name_dict = dict()
+        # the TranslatedLayer object holded var names count started from 0
+        with unique_name.guard():
+            for name, var in persistable_vars.items():
+                if isinstance(var, framework.ParamBase):
+                    dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX)
+                    self._persistable_var_name_dict[name] = dy_name
+                    self.add_parameter(dy_name, var)
+                elif isinstance(var, core.VarBase):
+                    dy_name = _generate_unique_var_name(BUFFER_NAME_PREFIX)
+                    self._persistable_var_name_dict[name] = dy_name
+                    self.register_buffer(dy_name, var)
+                else:
+                    raise TypeError(
+                        "Adding persistent variable which  to layer is not supported now"
+                    )
 
         self._is_test = True
 
@@ -655,7 +685,7 @@ class TranslatedLayer(layers.Layer):
         # 1. load program desc & construct _ProgramHolder
         programs = _construct_program_holders(model_path, model_filename)
 
-        # 2. load layer parameters & parameter attirbutes
+        # 2. load layer parameters & parameter attributes
         persistable_vars = _construct_params_and_buffers(
             model_path, programs, separate_params, params_filename)
 
@@ -700,10 +730,11 @@ class TranslatedLayer(layers.Layer):
 
             persistable_vars = []
             for var_name in program_holder.persistable_names:
-                if var_name in self._parameters:
-                    persistable_vars.append(self._parameters[var_name])
-                elif var_name in self._buffers:
-                    persistable_vars.append(self._buffers[var_name])
+                dy_var_name = self._persistable_var_name_dict[var_name]
+                if dy_var_name in self._parameters:
+                    persistable_vars.append(self._parameters[dy_var_name])
+                elif dy_var_name in self._buffers:
+                    persistable_vars.append(self._buffers[dy_var_name])
                 else:
                     raise ValueError(
                         "The persistable variable %s is not exists in current TranslatedLayer."
@@ -722,7 +753,7 @@ class TranslatedLayer(layers.Layer):
                                          core.VarDesc.VarType.STEP_SCOPES, True)
             tmp_scope_vec.value().set_scope(program_holder.scope)
 
-            # 2. run prorgam by op
+            # 2. run program by op
             trace_program = program_holder.infer_program if self._is_test else program_holder.train_program
             end_op_index = program_holder.infer_program.block(0).op_size()
             framework._dygraph_tracer().trace_op(
@@ -743,7 +774,7 @@ class TranslatedLayer(layers.Layer):
             # will be SelectedRows, not LoDTensor. But tracer will just
             # set param grad VarBase by forward VarBase(LoDTensor)
             # If we don't change grad_var type here, RunProgramOp need
-            # transform SelectedRows to LoDTensor forcely, it may not
+            # transform SelectedRows to LoDTensor forcibly, it may not
             # be user wanted result.
             for persistable_var in persistable_vars:
                 grad_var_name = var.name + core.grad_var_suffix()
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 8439b87dd9ced618ad4f0b2e6d9d321d5f8662be..853c16a5d0f7129f097f7fca860ab260f9dc7fd5 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -19,12 +19,13 @@ import pickle
 import warnings
 
 import six
+import paddle
 from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
+from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
@@ -33,7 +34,10 @@ from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dyg
 from paddle.fluid.framework import dygraph_only, in_dygraph_mode
 from paddle.fluid.wrapped_decorator import wrap_decorator
 
-__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
+__all__ = [
+    'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level',
+    'set_verbosity'
+]
 
 
 def create_program_from_desc(program_desc):
@@ -128,17 +132,42 @@ def _dygraph_to_static_func_(dygraph_func):
 dygraph_to_static_func = wrap_decorator(_dygraph_to_static_func_)
 
 
-def _declarative_(dygraph_func):
+def copy_decorator_attrs(original_func, decorated_obj):
+    """
+    Copies some necessary attributes from original function into decorated function.
+
+    Args:
+        original_func(callable): the original decorated function.
+        decorated_obj(StaticLayer): the target decorated StaticLayer object.
+    """
+    decorator_name = "declarative"
+
+    decorated_obj.__name__ = original_func.__name__
+    decorated_obj._decorator_name = decorator_name
+    decorated_obj.__wrapped__ = original_func
+    decorated_obj.__doc__ = original_func.__doc__
+    if hasattr(original_func, "__module__"):
+        decorated_obj.__module__ = original_func.__module__
+
+    return decorated_obj
+
+
+def declarative(function=None, input_spec=None):
     """
     Converts imperative dygraph APIs into declarative function APIs. Decorator
     @declarative handles the Program and Executor of static mode and returns
-    the result as a dygraph VarBase.
+    the result as dygraph Tensor(s). Users could use the returned dygraph
+    Tensor(s) to do imperative training, inference, or other operations. If the
+    decorated function calls other imperative function, the called one will be
+    converted into declarative function as well.
 
     Args:
-        dygraph_func (callable): callable imperative function.
+        function (callable): callable imperative function.
+        input_spec(list[InputSpec]): list of InputSpec to specific the shape/dtype/name
+            information of each input Tensor.
 
     Returns:
-        VarBase: containing the numerical result.
+        Tensor(s): containing the numerical result.
 
     Examples:
         .. code-block:: python
@@ -147,6 +176,7 @@ def _declarative_(dygraph_func):
           import numpy as np
           from paddle.fluid.dygraph.jit import declarative
 
+          fluid.enable_dygraph()
 
           @declarative
           def func(x):
@@ -163,37 +193,27 @@ def _declarative_(dygraph_func):
 
     """
 
-    def __impl__(*args, **kwargs):
-        program_translator = ProgramTranslator()
-        if not program_translator.enable_declarative:
-            warnings.warn(
-                "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
-                "We will just return dygraph output.")
-            return dygraph_func(*args, **kwargs)
-        try:
-            return program_translator.get_output(dygraph_func, *args, **kwargs)
-        except Exception as e:
-            error_data = getattr(e, ERROR_DATA, None)
-            if error_data:
-                new_exception = error_data.create_exception()
-                if six.PY3:
-                    # NOTE(liym27):
-                    # 1. Why `raise new_exception from None`?
-                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
-                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
-                    #   caught exception.
-                    # 2. Use exec to bypass syntax error checking in Python 2.
-
-                    six.exec_("raise new_exception from None")
-                else:
-                    raise new_exception
-            else:
-                raise
+    def decorated(python_func):
+        """
+        Decorates a python function into a StaticLayer object.
+        """
+        # Step 1. unwrap the function if it is already decorated.
+        _, python_func = unwrap_decorators(python_func)
 
-    return __impl__
+        # Step 2. copy some attributes from original python function.
+        static_layer = copy_decorator_attrs(
+            original_func=python_func,
+            decorated_obj=StaticLayer(
+                function=python_func, input_spec=input_spec))
+
+        return static_layer
 
+    # for usage: `declarative(foo, ...)`
+    if function is not None:
+        return decorated(function)
 
-declarative = wrap_decorator(_declarative_)
+    # for usage: `@declarative`
+    return decorated
 
 
 class SaveLoadConfig(object):
@@ -335,7 +355,7 @@ class SaveLoadConfig(object):
                 # use SaveLoadconfig.output_spec
                 model_path = "simplenet.example.model.output_spec"
                 configs = fluid.dygraph.jit.SaveLoadConfig()
-                # only keep the predicted output in saved model, diccard loss
+                # only keep the predicted output in saved model, discard loss
                 configs.output_spec = [out]
 
                 fluid.dygraph.jit.save(
@@ -370,7 +390,7 @@ class SaveLoadConfig(object):
         The name of file to save the translated program of target Layer.
         Default filename is :code:`__model__` .
 
-        Exampels:
+        Examples:
             .. code-block:: python
 
                 import numpy as np
@@ -440,7 +460,7 @@ class SaveLoadConfig(object):
         The name of file to save all persistable variables in target Layer. 
         Default file name is :code:`__variables__` .
         
-        Exampels:
+        Examples:
             .. code-block:: python
 
                 import numpy as np
@@ -593,7 +613,7 @@ def save(layer, model_path, input_spec=None, configs=None):
     The default saved translated program file name is ``__model__``,
     and the default saved persistable variables file name is ``__variables__``,
     and it also saved some additional variable description information to file 
-    ``__varibales.info__``, these additional information is used in fine-tuning.
+    ``__variables.info__``, these additional information is used in fine-tuning.
 
     The saved model can be loaded by follow APIs:
       - :ref:`api_imperative_jit_load`
@@ -603,7 +623,7 @@ def save(layer, model_path, input_spec=None, configs=None):
     Args:
         layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`.
         model_path (str): the directory to save the model.
-        input_spec (list[Varibale], optional): Describes the input of the saved model. 
+        input_spec (list[Variable], optional): Describes the input of the saved model. 
             It is the example inputs that will be passed to saved TranslatedLayer's forward
             function. If None, all input variables of the original Layer's forward function
             would be the inputs of the saved model. Default None.
@@ -701,11 +721,11 @@ def save(layer, model_path, input_spec=None, configs=None):
     prog_translator = ProgramTranslator()
     if not prog_translator.enable:
         raise RuntimeError(
-            "The paddle.imperative.jit.save doesn't work when setting ProgramTranslator.enable=False."
+            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False."
         )
     if not isinstance(layer, Layer):
         raise TypeError(
-            "The input layer of paddle.imperative.jit.save should be 'Layer', but received layer type is %s."
+            "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s."
             % type(layer))
 
     if configs is None:
@@ -717,16 +737,17 @@ def save(layer, model_path, input_spec=None, configs=None):
                 "The input input_spec should be 'list', but received input_spec's type is %s."
                 % type(input_spec))
         for var in input_spec:
-            if not isinstance(var, core.VarBase):
+            if not isinstance(var, (core.VarBase, Variable,
+                                    paddle.static.InputSpec)):
                 raise TypeError(
-                    "The element in input_spec list should be 'Variable', but received element's type is %s."
+                    "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s."
                     % type(var))
 
     # 2. get program of declarative Layer.forward
-    prog_cache = prog_translator.get_program_cache()
-    # make dummy args & kwargs, to get excepted FunctionSpec
-    layer_func = FunctionSpec(type(layer).forward, [layer], {})
-    concrete_program, _ = prog_cache.get_program(layer_func)
+    if not isinstance(layer.forward, StaticLayer):
+        raise RuntimeError(
+            "layer.forward need to be decorated by `@declarative`.")
+    concrete_program = layer.forward.concrete_program
 
     # NOTE: we maintain the mapping of variable name to
     # structured name, the buffer variable (non-persistable)
@@ -810,7 +831,7 @@ def load(model_path, configs=None):
         For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`,
         there will be the following limitations when using it in fine-tuning:
         1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable.
-        2. All saved model's feed targets need to be passed into TranslatedLayer's forwrad function.
+        2. All saved model's feed targets need to be passed into TranslatedLayer's forward function.
         3. The variable's ``stop_gradient`` information is lost and can not be recovered.
         4. The parameter's ``trainable`` information is lost and can not be recovered.
 
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
index f2e914a2137d0be0606556471696fd3d255b3c12..a904f80639752a7538289a1ce7c2abf378ccc634 100644
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -136,18 +136,13 @@ class LayerObjectHelper(LayerHelperBase):
         return param
 
     # TODO: this should not be called anymore after all activation func move to Layers
-    def append_activation(self,
-                          input_var,
-                          act=None,
-                          use_cudnn=None,
-                          use_mkl_dnn=None):
+    def append_activation(self, input_var, act=None, use_cudnn=None):
         """Append activation
 
             Args:
                 input_var: the input variable. The len(input_var.shape) is
                 larger or equal than 2.
                 act: activation type
-                use_mkl_dnn: if use mkldnn
                 use_cudnn: if use cudnn
 
         Return the Variable of after append activation
@@ -163,8 +158,9 @@ class LayerObjectHelper(LayerHelperBase):
 
         if (use_cudnn is not None) and use_cudnn:
             act['use_cudnn'] = use_cudnn
-        if (use_mkl_dnn is not None) and use_mkl_dnn:
-            act['use_mkldnn'] = use_mkl_dnn
+        use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+        if (use_mkldnn is not None) and use_mkldnn:
+            act['use_mkldnn'] = use_mkldnn
         act_type = act.pop('type')
 
         tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 72f105933dca919c8b3c2cbdf90318a5444d0866..1ef719b9da187be659d9c898ec996b5ad0c0d8a6 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -146,7 +146,7 @@ class Layer(core.Layer):
               import paddle
               import paddle.nn as nn
               
-              paddle.enable_imperative()
+              paddle.disable_static()
               
               net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
 
@@ -161,7 +161,7 @@ class Layer(core.Layer):
 
               print(net.state_dict())
         """
-        for layer in self.sublayers():
+        for layer in self.children():
             layer.apply(fn)
 
         fn(self)
@@ -283,7 +283,7 @@ class Layer(core.Layer):
     def create_parameter(self,
                          shape,
                          attr=None,
-                         dtype='float32',
+                         dtype=None,
                          is_bias=False,
                          default_initializer=None):
         """Create parameters for this layer.
@@ -353,6 +353,56 @@ class Layer(core.Layer):
         ]
         return ret
 
+    def children(self):
+        """Returns an iterator over immediate children layers.
+
+        Yields:
+            Layer: a child layer
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    fc1 = fluid.Linear(10, 3)
+                    fc2 = fluid.Linear(3, 10, bias_attr=False)
+                    model = fluid.dygraph.Sequential(fc1, fc2)
+                    
+                    layer_list = list(model.children())
+
+                    print(layer_list)
+
+        """
+        for _, layer in self.named_children():
+            yield layer
+
+    def named_children(self):
+        """Returns an iterator over immediate children layers, yielding both
+        the name of the layer as well as the layer itself.
+
+        Yields:
+            (string, Layer): Tuple containing a name and child layer
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    fc1 = fluid.Linear(10, 3)
+                    fc2 = fluid.Linear(3, 10, bias_attr=False)
+                    model = fluid.dygraph.Sequential(fc1, fc2)
+                    for prefix, layer in model.named_children():
+                        print(prefix, layer)
+
+        """
+        memo = set()
+        for name, layer in self._sub_layers.items():
+            if layer is not None and layer not in memo:
+                memo.add(layer)
+                yield name, layer
+
     def sublayers(self, include_sublayers=True):
         """Returns a list of sub layers.
 
@@ -503,7 +553,10 @@ class Layer(core.Layer):
                 "The name of buffer should be a string, but received {}.".
                 format(type(name).__name__))
         elif '.' in name:
-            raise KeyError("The name of buffer can not contain \".\"")
+            raise KeyError(
+                "The name of buffer can not contain `.`, "
+                "because when you access the newly added buffer in the "
+                "form of `self.**.**`, it will cause AttributeError.")
         elif name == '':
             raise KeyError("The name of buffer can not be empty.")
         elif hasattr(self, name) and name not in self._buffers:
@@ -686,20 +739,38 @@ class Layer(core.Layer):
         Returns:
             Parameter: the parameter passed in.
         """
-        if parameter is None:
-            self._parameters[name] = None
-        elif not isinstance(parameter, framework.Parameter):
+        if '_parameters' not in self.__dict__:
+            raise RuntimeError(
+                "super(YourLayer, self).__init__() should be called firstly.")
+        elif not isinstance(name, six.string_types):
+            raise TypeError(
+                "The name of parameter should be a string, but received {}.".
+                format(type(name).__name__))
+        elif '.' in name:
+            raise KeyError(
+                "The name of parameter can not contain `.`, "
+                "because when you access the newly added parameter in the "
+                "form of `self.**.**`, it will cause AttributeError.")
+        elif name == '':
+            raise KeyError("The name of parameter can not be empty.")
+        elif hasattr(self, name) and name not in self._parameters:
+            raise KeyError("The parameter '{}' already exists.".format(name))
+        elif parameter is not None and not isinstance(parameter,
+                                                      framework.Parameter):
             raise TypeError(
-                "parameter assignment requires Parameter or None, but got '{}'"
-                .format(type(parameter).__name__))
+                "The parameter to be added should be a Parameter, but received {}.".
+                format(type(parameter).__name__))
+        else:
+            if parameter is None:
+                self._parameters[name] = None
 
-        if len(self._loaddict_holder) > 0:
-            assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format(
-                parameter.name)
+            if len(self._loaddict_holder) > 0:
+                assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format(
+                    parameter.name)
 
-            parameter.set_value(self._loaddict_holder[parameter.name])
+                parameter.set_value(self._loaddict_holder[parameter.name])
 
-        self._parameters[name] = parameter
+            self._parameters[name] = parameter
         return parameter
 
     def __getattr__(self, name):
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index d2c779a85497917179736777dac25efa7cfba228..bb55c6725e6a62f2cef393fd34b249c217be0c54 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -17,7 +17,9 @@ from __future__ import print_function
 from .. import core
 from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
 from ..layers.layer_function_generator import OpProtoHolder
+from ..layers import common_methods
 from . import to_variable, no_grad
+import paddle
 
 import numpy as np
 import six
@@ -30,6 +32,8 @@ _supported_int_dtype_ = [
     core.VarDesc.VarType.INT64,
 ]
 
+_already_patch_varbase = False
+
 
 def monkey_patch_math_varbase():
     """
@@ -37,7 +41,7 @@ def monkey_patch_math_varbase():
     The difference is, in dygraph mode, use auto-generated op functions for better performance.
     """
 
-    @no_grad
+    @no_grad()
     def create_tensor(value, dtype, shape):
         out = _varbase_creator(dtype=dtype)
         out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape,
@@ -140,25 +144,50 @@ def monkey_patch_math_varbase():
         else:
             return int(var.numpy().flatten()[0])
 
-    def _scalar_elementwise_add_(var, value):
+    @property
+    def _ndim_(var):
+        return len(var.shape)
+
+    def _scalar_add_(var, value):
         return _scalar_elementwise_op_(var, 1.0, value)
 
-    def _scalar_elementwise_sub_(var, value):
+    def _scalar_sub_(var, value):
         return _scalar_elementwise_op_(var, 1.0, -value)
 
-    def _scalar_elementwise_rsub_(var, value):
+    def _scalar_rsub_(var, value):
         return _scalar_elementwise_op_(var, -1.0, value)
 
-    def _scalar_elementwise_mul_(var, value):
+    def _scalar_mul_(var, value):
         return _scalar_elementwise_op_(var, value, 0.0)
 
-    def _scalar_elementwise_div_(var, value):
+    def _scalar_div_(var, value):
         return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
 
-    def _elemwise_method_creator_(method_name,
-                                  op_type,
-                                  reverse=False,
-                                  scalar_method=None):
+    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
+    # for binary operator by using the api to achieve the type promotion
+    def _binary_method_creator_(op_type, reverse=False):
+        import paddle
+
+        def __impl__(self, other_var):
+            import paddle
+            op = getattr(paddle, op_type)
+            if reverse:
+                return op(other_var, self)
+            else:
+                return op(self, other_var)
+
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
+
+    # for binary operator such as elementwise, compare
+    def _binary_creator_(method_name,
+                         op_type,
+                         reverse=False,
+                         scalar_method=None):
         def __impl__(self, other_var):
             # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
             # which may lose accuracy. This is a hot fix for release 1.6.
@@ -200,60 +229,117 @@ def monkey_patch_math_varbase():
         __impl__.__doc__ = """
         {0}
         Args:
-            self(Variable): left hand variable
-            other_var(Variable|float|int): right hand variable
+            self(Tensor): left hand Tensor
+            other_var(Tensor|float|int): right hand Tensor
 
         Returns:
-            Variable
+            Tensor
         """.format(comment)
         __impl__.__name__ = method_name
         return __impl__
 
-    # inject methods
-    for method_name, op_type, reverse, scalar_method in (
-        ("__add__", "elementwise_add", False, _scalar_elementwise_add_),
-            # a+b == b+a. Do not need to reverse explicitly
-        ("__radd__", "elementwise_add", False, _scalar_elementwise_add_),
-        ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_),
-        ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_),
-        ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-            # a*b == b*a. Do not need to reverse explicitly
-        ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-        ("__div__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__rdiv__", "elementwise_div", True, None),
-        ("__rtruediv__", "elementwise_div", True, None),
-        ("__pow__", "elementwise_pow", False, None),
-        ("__rpow__", "elementwise_pow", True, None),
-        ("__floordiv__", "elementwise_floordiv", False, None),
-        ("__mod__", "elementwise_mod", False, None),
-            # for logical compare
-        ("__eq__", "equal", False, None),
-        ("__ne__", "not_equal", False, None),
-        ("__lt__", "less_than", False, None),
-        ("__le__", "less_equal", False, None),
-        ("__gt__", "greater_than", False, None),
-        ("__ge__", "greater_equal", False, None)):
-
-        setattr(core.VarBase, method_name,
-                _elemwise_method_creator_(method_name, op_type, reverse,
-                                          scalar_method))
-
-    # b = -a
-    core.VarBase.__neg__ = _neg_
-    core.VarBase.__float__ = _float_
-    core.VarBase.__long__ = _long_
-    core.VarBase.__int__ = _int_
-    core.VarBase.__len__ = _len_
-    core.VarBase.__index__ = _index_
-    core.VarBase.astype = astype
-    """
-    When code is written like this
-    y = np.pi * var
-    ndarray.__mul__(self, var) is called, var will be traced as an array(by using __len__, __getitem__), which is not right.
-    when var.__array_ufunc__  is set to None, var.__rmul__(self,  np) will be called.
+    # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template')
+    #  Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time.
+    def _method_creator_(op_type, arg_template=None):
+        def __impl__(self):
+            op = getattr(core.ops, op_type)
+            return op(self)
 
-    The details can be seen bellow:
-    https://docs.scipy.org/doc/numpy-1.13.0/neps/ufunc-overrides.html#behavior-in-combination-with-python-s-binary-operations
-    """
-    core.VarBase.__array_ufunc__ = None
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
+
+    varbase_methods = [
+        # Type1: From custom fun or lambda
+        ##   b=-a
+        ('__neg__', _neg_),
+        ('__float__', _float_),
+        ('__long__', _long_),
+        ('__int__', _int_),
+        ('__len__', _len_),
+        ('__index__', _index_),
+        ('astype', astype),
+        ('dim', lambda x: len(x.shape)),
+        ('ndimension', lambda x: len(x.shape)),
+        ('ndim', _ndim_),
+        ('size', lambda x: x.shape),
+        # Type2: From Template that create core.ops automatically. It's recommended.
+        ('__add__',
+         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
+        ##  a+b == b+a. Do not need to reverse explicitly
+        ('__radd__',
+         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
+                                     _scalar_sub_)),
+        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
+                                      _scalar_rsub_)),
+        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
+                                     _scalar_mul_)),
+        ## a*b == b*a. Do not need to reverse explicitly
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True,
+                                          None)),
+        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
+                                     None)),
+        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
+                                      None)),
+        # These binary use paddle.optype
+        ('__div__', _binary_method_creator_('divide', False)),
+        ('__truediv__', _binary_method_creator_('divide', False)),
+        ('__rtruediv__', _binary_method_creator_('divide', True)),
+        ('__rdiv__', _binary_method_creator_('divide', True)),
+        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
+        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
+        ('__mod__', _binary_method_creator_('remainder', False)),
+        ## for logical compare
+        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
+        ('__array_ufunc__', None),
+        ('sigmoid', _method_creator_('sigmoid', 'name=None')),
+        ('logsigmoid', _method_creator_('logsigmoid', 'name=None')),
+        ('exp', _method_creator_('exp', 'name=None')),
+        ('tanh', _method_creator_('tanh', 'name=None')),
+        ('atan', _method_creator_('atan', 'name=None')),
+        ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')),
+        ('sqrt', _method_creator_('sqrt', 'name=None')),
+        ('rsqrt', _method_creator_('rsqrt', 'name=None')),
+        ('abs', _method_creator_('abs', 'name=None')),
+        ('ceil', _method_creator_('ceil', 'name=None')),
+        ('floor', _method_creator_('floor', 'name=None')),
+        ('cos', _method_creator_('cos', 'name=None')),
+        ('acos', _method_creator_('acos', 'name=None')),
+        ('asin', _method_creator_('asin', 'name=None')),
+        ('sin', _method_creator_('sin', 'name=None')),
+        ('sinh', _method_creator_('sinh', 'name=None')),
+        ('cosh', _method_creator_('cosh', 'name=None')),
+        ('round', _method_creator_('round', 'name=None')),
+        ('reciprocal', _method_creator_('reciprocal', 'name=None')),
+        ('square', _method_creator_('square', 'name=None')),
+        ('softplus', _method_creator_('softplus', 'name=None')),
+        ('softsign', _method_creator_('softsign', 'name=None')),
+        # Type3: Form module 'paddle.tensor' defaultly.
+        #   It's not a goodway, because it will increase call time.
+    ]
+
+    global _already_patch_varbase
+    if not _already_patch_varbase:
+        for method in varbase_methods:
+            method_name = method[0]
+            method_impl = method[1]
+            setattr(core.VarBase, method_name, method_impl)
+    else:
+        import paddle.tensor
+        for method_name in common_methods:
+            if hasattr(core.VarBase, method_name): continue
+            method_impl = getattr(paddle.tensor, method_name, None)
+            if method_impl: setattr(core.VarBase, method_name, method_impl)
+
+    _already_patch_varbase = True
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 58efa58ac8b3ba80a7ddc293681f3183c44734ef..a14c3a81c121758ed90450cd5eb5990f3f7739e1 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import paddle
 from six.moves import reduce
 from .. import core
 from ..layers import utils
@@ -30,6 +31,7 @@ from ..data_feeder import check_variable_and_dtype, check_type
 import numpy as np
 import numbers
 import logging
+import paddle.utils.deprecated as deprecated
 
 __all__ = [
     'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
@@ -180,6 +182,7 @@ class Conv2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
         self._use_cudnn = use_cudnn
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
         self._filter_size = filter_size
         self._num_filters = num_filters
         self._param_attr = param_attr
@@ -187,7 +190,8 @@ class Conv2D(layers.Layer):
         self._dtype = dtype
 
         if (self._num_channels == self._groups and
-                num_filters % self._num_channels == 0 and not self._use_cudnn):
+                num_filters % self._num_channels == 0 and
+                not self._use_cudnn and not self._use_mkldnn):
             self._l_type = 'depthwise_conv2d'
         else:
             self._l_type = 'conv2d'
@@ -224,14 +228,15 @@ class Conv2D(layers.Layer):
         if in_dygraph_mode() and self._l_type == 'conv2d':
             attrs = ('strides', self._stride, 'paddings', self._padding,
                      'dilations', self._dilation, 'groups', self._groups
-                     if self._groups else 1, 'use_cudnn', self._use_cudnn)
+                     if self._groups else 1, 'use_cudnn', self._use_cudnn,
+                     'use_mkldnn', self._use_mkldnn)
             out = core.ops.conv2d(input, self.weight, *attrs)
             pre_bias = out
 
-            pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, self.bias,
-                                                            1)
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               self._act)
+            pre_act = dygraph_utils._append_bias_in_dygraph(
+                pre_bias, self.bias, 1, use_mkldnn=self._use_mkldnn)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, self._act, use_mkldnn=self._use_mkldnn)
         inputs = {
             'Input': [input],
             'Filter': [self.weight],
@@ -242,7 +247,7 @@ class Conv2D(layers.Layer):
             'dilations': self._dilation,
             'groups': self._groups if self._groups else 1,
             'use_cudnn': self._use_cudnn,
-            'use_mkldnn': False,
+            'use_mkldnn': self._use_mkldnn,
         }
 
         check_variable_and_dtype(input, 'input',
@@ -267,7 +272,8 @@ class Conv2D(layers.Layer):
                 inputs={'X': [pre_bias],
                         'Y': [self.bias]},
                 outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+                attrs={'axis': 1,
+                       'use_mkldnn': self._use_mkldnn})
         else:
             pre_act = pre_bias
 
@@ -828,6 +834,8 @@ class Pool2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
 
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+
         if data_format not in ["NCHW", "NHWC"]:
             raise ValueError(
                 "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
@@ -853,8 +861,8 @@ class Pool2D(layers.Layer):
                      'global_pooling', self._global_pooling, 'strides',
                      self._pool_stride, 'paddings', self._pool_padding,
                      'use_cudnn', self._use_cudnn, 'ceil_mode', self._ceil_mode,
-                     'use_mkldnn', False, 'exclusive', self._exclusive,
-                     'data_format', self._data_format)
+                     'use_mkldnn', self._use_mkldnn, 'exclusive',
+                     self._exclusive, 'data_format', self._data_format)
             return core.ops.pool2d(input, *attrs)
 
         check_variable_and_dtype(
@@ -869,7 +877,7 @@ class Pool2D(layers.Layer):
             "paddings": self._pool_padding,
             "use_cudnn": self._use_cudnn,
             "ceil_mode": self._ceil_mode,
-            "use_mkldnn": False,
+            "use_mkldnn": self._use_mkldnn,
             "exclusive": self._exclusive,
             "data_format": self._data_format,
         }
@@ -958,16 +966,22 @@ class Linear(layers.Layer):
         self.bias = self.create_parameter(
             shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True)
 
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+
     def forward(self, input):
         if in_dygraph_mode():
             pre_bias = _varbase_creator(dtype=input.dtype)
             core.ops.matmul(input, self.weight, pre_bias, 'transpose_X', False,
-                            'transpose_Y', False, "alpha", 1)
+                            'transpose_Y', False, "alpha", 1, "use_mkldnn",
+                            self._use_mkldnn)
             pre_act = dygraph_utils._append_bias_in_dygraph(
-                pre_bias, self.bias, axis=len(input.shape) - 1)
+                pre_bias,
+                self.bias,
+                axis=len(input.shape) - 1,
+                use_mkldnn=self._use_mkldnn)
 
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               self._act)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, self._act, use_mkldnn=self._use_mkldnn)
 
         check_variable_and_dtype(input, 'input',
                                  ['float16', 'float32', 'float64'], "Linear")
@@ -976,6 +990,7 @@ class Linear(layers.Layer):
             "transpose_X": False,
             "transpose_Y": False,
             "alpha": 1,
+            "use_mkldnn": self._use_mkldnn,
         }
         inputs = {"X": [input], "Y": [self.weight]}
 
@@ -990,7 +1005,10 @@ class Linear(layers.Layer):
                 inputs={'X': [tmp],
                         'Y': [self.bias]},
                 outputs={'Out': [pre_activation]},
-                attrs={'axis': len(input.shape) - 1})
+                attrs={
+                    'axis': len(input.shape) - 1,
+                    'use_mkldnn': self._use_mkldnn
+                })
         else:
             pre_activation = tmp
         return self._helper.append_activation(pre_activation, act=self._act)
@@ -1250,6 +1268,7 @@ class BatchNorm(layers.Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._act = act
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
 
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."
 
@@ -1314,8 +1333,8 @@ class BatchNorm(layers.Layer):
         if in_dygraph_mode():
             attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                      "is_test", not self.training, "data_layout",
-                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
-                     self._fuse_with_relu, "use_global_stats",
+                     self._data_layout, "use_mkldnn", self._use_mkldnn,
+                     "fuse_with_relu", self._fuse_with_relu, "use_global_stats",
                      self._use_global_stats, 'trainable_statistics',
                      self._trainable_statistics)
             batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
@@ -1323,7 +1342,7 @@ class BatchNorm(layers.Layer):
                 mean_out, variance_out, *attrs)
 
             return dygraph_utils._append_activation_in_dygraph(
-                batch_norm_out, act=self._act)
+                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn)
 
         check_variable_and_dtype(input, 'input',
                                  ['float16', 'float32', 'float64'], 'BatchNorm')
@@ -2308,7 +2327,8 @@ class PRelu(layers.Layer):
             #NOTE(zhiqiu): The _alpha_shape should be [1, channel] + [1] * len(input_shape[2:]), not [1, channel, 1, 1].
             # However, the suffix 1 in the list is useless, since the tensor is viewed as one demension array during kernel calculation. 
             # And, input_shape is not required when mode is 'channel', so it is simplified.
-            self._alpha_shape = [1, channel]
+            #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
+            self._alpha_shape = [1, channel, 1, 1]
         elif mode == 'element':
             assert isinstance(input_shape, (
                 list, tuple
@@ -2426,6 +2446,10 @@ class BilinearTensorProduct(layers.Layer):
             dtype=self._dtype,
             is_bias=True)
 
+    @deprecated(
+        since="2.0.0",
+        update_to="paddle.nn.Bilinear",
+        reason="New name and new args in Bilinear, easier to use.")
     def forward(self, x, y):
         check_variable_and_dtype(x, 'x', ['float32', 'float64'],
                                  'BilinearTensorProduct')
@@ -3206,12 +3230,12 @@ class Flatten(layers.Layer):
         .. code-block:: python
 
           import paddle
-          from paddle.imperative import to_variable
+          from paddle import to_variable
           import numpy as np
 
           inp_np = np.ones([5, 2, 3, 4]).astype('float32')
           
-          paddle.enable_imperative()
+          paddle.disable_static()
           
           inp_np = to_variable(inp_np)
           flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
@@ -3225,19 +3249,6 @@ class Flatten(layers.Layer):
         self.stop_axis = stop_axis
 
     def forward(self, input):
-        out = self._helper.create_variable_for_type_inference(input.dtype)
-        x_shape = self._helper.create_variable_for_type_inference(input.dtype)
-
-        if in_dygraph_mode():
-            dy_out, _ = core.ops.flatten_contiguous_range(
-                input, 'start_axis', self.start_axis, 'stop_axis',
-                self.stop_axis)
-            return dy_out
-        self._helper.append_op(
-            type="flatten_contiguous_range",
-            inputs={"X": input},
-            outputs={"Out": out,
-                     "XShape": x_shape},
-            attrs={"start_axis": self.start_axis,
-                   "stop_axis": self.stop_axis})
+        out = paddle.tensor.manipulation.flatten(
+            input, start_axis=self.start_axis, stop_axis=self.stop_axis)
         return out
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 804076f608e714b4c2623bfb580bfe09e42c8db2..54d2cda4ca6858c46140e1fbf6ac8860c3a7c78d 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -242,41 +242,38 @@ class DataParallel(layers.Layer):
     Examples:
         .. code-block:: python
 
-           import numpy as np
-           import paddle.fluid as fluid
-           import paddle.fluid.dygraph as dygraph
-           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import Linear
-           from paddle.fluid.dygraph.base import to_variable
+            import numpy as np
+            import paddle.fluid as fluid
 
-           place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-           with fluid.dygraph.guard(place=place):
+            place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+            with fluid.dygraph.guard(place):
 
-               # prepare the data parallel context
-               strategy=dygraph.prepare_context()
+                # prepare the data parallel context
+                strategy = fluid.dygraph.prepare_context()
 
-               linear = Linear(1, 10, act="softmax")
-               adam = fluid.optimizer.AdamOptimizer()
+                linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                adam = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, parameter_list=linear.parameters())
 
-               # make the module become the data parallelism module
-               linear = dygraph.DataParallel(linear, strategy)
+                # make the module become the data parallelism module
+                linear = fluid.dygraph.DataParallel(linear, strategy)
 
-               x_data = np.random.random(size=[10, 1]).astype(np.float32)
-               data = to_variable(x_data)
+                x_data = np.random.random(size=[10, 1]).astype(np.float32)
+                data = fluid.dygraph.to_variable(x_data)
 
-               hidden = linear(data)
-               avg_loss = fluid.layers.mean(hidden)
+                hidden = linear(data)
+                avg_loss = fluid.layers.mean(hidden)
 
-               # scale the loss according to the number of trainers.
-               avg_loss = linear.scale_loss(avg_loss)
+                # scale the loss according to the number of trainers.
+                avg_loss = linear.scale_loss(avg_loss)
 
-               avg_loss.backward()
+                avg_loss.backward()
 
-               # collect the gradients of trainers.
-               linear.apply_collective_grads()
+                # collect the gradients of trainers.
+                linear.apply_collective_grads()
 
-               adam.minimize(avg_loss)
-               linear.clear_gradients()
+                adam.minimize(avg_loss)
+                linear.clear_gradients()
     """
 
     def __init__(self, layers, strategy):
@@ -306,20 +303,23 @@ class DataParallel(layers.Layer):
 
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
 
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
+
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
 
@@ -327,6 +327,8 @@ class DataParallel(layers.Layer):
                     avg_loss = linear.scale_loss(avg_loss)
 
                     avg_loss.backward()
+
+                    # collect the gradients of trainers.
                     linear.apply_collective_grads()
 
                     adam.minimize(avg_loss)
@@ -380,7 +382,7 @@ class DataParallel(layers.Layer):
                 self._reshape_inplace(x=g_var, shape=g_shape)
                 assert g_var.shape == g_shape
 
-    @no_grad
+    @no_grad()
     def apply_collective_grads(self):
         """
         AllReduce the Parameters' gradient.
@@ -390,23 +392,29 @@ class DataParallel(layers.Layer):
 
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
 
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
+
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
+
+                    # scale the loss according to the number of trainers.
                     avg_loss = linear.scale_loss(avg_loss)
+
                     avg_loss.backward()
 
                     # collect the gradients of trainers.
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index d509fcc38e771bf5a5bacb63602966a871c7c885..7cb17843396a6ed79c36126172a253864dbf3d0f 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -15,7 +15,6 @@
 import inspect
 from .. import framework
 from .. import core
-from . import BackwardStrategy
 from ..framework import Variable, Parameter, ParamBase
 from .base import switch_to_static_graph
 import numpy as np
@@ -50,14 +49,19 @@ def monkey_patch_varbase():
                     static_var = var_base._to_static_var()
 
         """
+
+        # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. 
+        # It will fail. So, for propery in dygraph only, should not let it getattr(self, attr, None).
+        attr_not_need_keys = ['grad']
         if isinstance(self, ParamBase):
             attr_kwargs = self.__dict__.copy()
         else:
-            attr_names = [
-                name for name in dir(self)
-                if not (inspect.ismethod(getattr(self, name)) or
-                        name.startswith('_'))
-            ]
+            attr_names = []
+            for name in dir(self):
+                if name not in attr_not_need_keys and not (
+                        inspect.ismethod(getattr(self, name)) or
+                        name.startswith('_')):
+                    attr_names.append(name)
             attr_kwargs = {name: getattr(self, name) for name in attr_names}
 
         attr_keys = ['block', 'shape', 'dtype', 'type', 'name', 'persistable']
@@ -124,19 +128,18 @@ def monkey_patch_varbase():
                                       framework._current_expected_place())
 
     @framework.dygraph_only
-    def backward(self, backward_strategy=None, retain_graph=False):
+    def backward(self, retain_graph=False):
         """
         **Notes**:
             **This API is ONLY available in Dygraph mode**
 
-        Run backward of current Graph which starts from current Variable
+        Run backward of current Graph which starts from current Tensor.
 
         Args:
-            backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
             retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
-            like to add more ops to the built graph after calling this method(`backward`), set the parameter
-            `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
-            Defaults to False.
+                like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+                :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
+                Defaults to False.
 
         Returns:
             NoneType: None
@@ -144,32 +147,25 @@ def monkey_patch_varbase():
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
                 import numpy as np
+                import paddle
+                paddle.disable_static()
 
                 x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                        tmp = fluid.dygraph.base.to_variable(x)
-                        # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
-                        # there is no one need gradient on it.
-                        tmp.stop_gradient=False
-                        inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.to_tensor(x)
+                    # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
+                    # there is no one need gradient on it.
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
 
         """
         if framework.in_dygraph_mode():
-            if backward_strategy is None:
-                backward_strategy = BackwardStrategy()
-                backward_strategy.sort_sum_gradient = False
-
-            self._run_backward(backward_strategy,
-                               framework._dygraph_tracer(), retain_graph)
+            self._run_backward(framework._dygraph_tracer(), retain_graph)
         else:
             raise ValueError(
                 "Variable.backward() is only available in DyGraph mode")
@@ -200,9 +196,7 @@ def monkey_patch_varbase():
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
 
         """
@@ -216,6 +210,14 @@ def monkey_patch_varbase():
         else:
             return np.array(new_ivar.value().get_tensor())
 
+    @property
+    def grad(self):
+        """
+        The alias of gradient().
+        """
+
+        return self.gradient()
+
     def __str__(self):
         """
         Convert a VarBase object to a readable string.
@@ -226,7 +228,7 @@ def monkey_patch_varbase():
             .. code-block:: python
 
                 import paddle
-                paddle.enable_imperative()
+                paddle.disable_static()
                 x = paddle.rand([1, 5])
                 print(x)
                 # Variable: eager_tmp_0
@@ -235,13 +237,13 @@ def monkey_patch_varbase():
                 #   - layout: NCHW
                 #   - dtype: float
                 #   - data: [0.645307 0.597973 0.732793 0.646921 0.540328]
-                paddle.disable_imperative()
+                paddle.enable_static()
         """
         tensor = self.value().get_tensor()
         if tensor._is_initialized():
-            return 'Variable: %s\n%s' % (self.name, str(tensor))
+            return 'Tensor: %s\n%s' % (self.name, str(tensor))
         else:
-            return 'Variable: %s, not initialized' % (self.name)
+            return 'Tensor: %s, not initialized' % (self.name)
 
     @property
     def block(self):
@@ -260,8 +262,9 @@ def monkey_patch_varbase():
     for method_name, method in (
         ("__bool__", __bool__), ("__nonzero__", __nonzero__),
         ("_to_static_var", _to_static_var), ("set_value", set_value),
-        ("block", block), ("backward", backward), ("gradient", gradient),
-        ("__str__", __str__)):
+        ("block", block), ("backward", backward), ("grad", grad),
+        ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__),
+        ("__module__", "paddle"), ("__name__", "Tensor")):
         setattr(core.VarBase, method_name, method)
 
     # patch math methods for varbase
diff --git a/python/paddle/fluid/dygraph_utils.py b/python/paddle/fluid/dygraph_utils.py
index 7b559494e6c3b779983e54f5f9675170ef985f63..a2338b874f51a209cf941d8c08d5995db4054968 100644
--- a/python/paddle/fluid/dygraph_utils.py
+++ b/python/paddle/fluid/dygraph_utils.py
@@ -45,17 +45,19 @@ def _append_activation_in_dygraph(input,
 
 
 @dygraph_only
-def _append_bias_in_dygraph(input, bias=None, axis=1):
+def _append_bias_in_dygraph(input, bias=None, axis=1, use_mkldnn=False):
     """Append bias operation in dygraph mode.
 
         Args:
             input: the input variable. 
             bias:  the bias to be appended
             axis:  the axis to perform operation
+            use_mkldnn: whether to use mkldnn
 
     Return the Variable after bias operation
     """
     if bias is None:
         return input
 
-    return core.ops.elementwise_add(input, bias, 'axis', axis)
+    return core.ops.elementwise_add(input, bias, 'axis', axis, 'use_mkldnn',
+                                    use_mkldnn)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 27a59e76593ec2c456bb63cb1defa4e1d1f3e77c..2e3f34f41648a9343b4bccd1044bcd3f7b3d8189 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -31,6 +31,7 @@ from .. import compat as cpt
 from .trainer_factory import TrainerFactory
 from .trainer_factory import FetchHandlerMonitor
 import copy
+from . import framework
 from .incubate.checkpoint import auto_checkpoint as acp
 
 __all__ = ['Executor', 'global_scope', 'scope_guard']
@@ -109,7 +110,7 @@ def scope_guard(scope):
         _switch_scope(ex)
 
 
-def as_numpy(tensor):
+def as_numpy(tensor, copy=False):
     """
     Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
     For higher dimensional sequence data, please use LoDTensor directly.
@@ -128,6 +129,7 @@ def as_numpy(tensor):
 
     Args:
        tensor(Variable): a instance of Tensor
+       copy(bool, optional): Whether to use deep copy.
 
     Returns:
         numpy.ndarray
@@ -144,7 +146,10 @@ def as_numpy(tensor):
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
     if tensor._is_initialized():
-        return np.array(tensor)
+        if copy:
+            return np.array(tensor)
+        else:
+            return np.asarray(tensor)
     else:
         return None
 
@@ -349,7 +354,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
         " program.")
     tensor = var.get_tensor()
     if return_numpy:
-        tensor = as_numpy(tensor)
+        tensor = as_numpy(tensor, copy=True)
     return tensor
 
 
@@ -544,10 +549,8 @@ class Executor(object):
 
     def __init__(self, place=None):
         if place is None:
-            if core.is_compiled_with_cuda():
-                self.place = core.CUDAPlace(0)
-            else:
-                self.place = core.CPUPlace()
+            expected_place = framework._current_expected_place()
+            self.place = expected_place
         else:
             self.place = place
         self.program_caches = dict()
@@ -851,6 +854,7 @@ class Executor(object):
 
     def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                       return_numpy, return_merged):
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         exe = program._executor
         # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram
         # instead of program. We will add support for checking Vars in Graph
@@ -894,6 +898,16 @@ class Executor(object):
                 res.append(res_dict)
             exe.feed_tensors_into_local_scopes(res)
 
+        if hasattr(program._program, 'lr_sheduler'):
+            lr_sheduler = program._program.lr_sheduler
+            assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler"
+            lr_value = lr_sheduler()
+            lr_var = program._program.global_block().vars[lr_sheduler._var_name]
+            lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
+            exe.feed_and_split_tensor_into_local_scopes({
+                lr_sheduler._var_name: lr_tensor
+            })
+
         fetch_var_names = list(map(_to_name_str, fetch_list))
         tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
         return as_numpy(tensors) if return_numpy else tensors
@@ -1157,6 +1171,26 @@ class Executor(object):
 
         compiled = isinstance(program, compiler.CompiledProgram)
 
+        # Check if fluid.data() variable no feed data
+        if use_prune:
+            if compiled:
+                global_block = program._program.global_block()
+            else:
+                global_block = program.global_block()
+            for varname in global_block.vars:
+                vardesc = global_block.desc.find_var(cpt.to_bytes(varname))
+                varobj = global_block.vars[varname]
+
+                # Can not check var build by fluid.layers.data(), bucause fluid.layers.data() had not set need_check_feed
+                if vardesc.persistable() == False and \
+                    vardesc.type() == core.VarDesc.VarType.LOD_TENSOR and \
+                    vardesc.need_check_feed() == True and \
+                    varobj._stop_gradient == True and \
+                    varobj.is_data == True and \
+                    varobj.belong_to_optimizer == False and \
+                    varname not in feed:
+                    raise ValueError('Need feed data for variable %s' % varname)
+
         acp._auto_checkpoint(self, program)
 
         # For backward compatibility, run directly.
@@ -1203,7 +1237,7 @@ class Executor(object):
 
     def _run_program(self, program, feed, fetch_list, feed_var_name,
                      fetch_var_name, scope, return_numpy, use_program_cache):
-
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         if feed is None:
             feed = {}
         elif isinstance(feed, (list, tuple)):
@@ -1259,6 +1293,16 @@ class Executor(object):
                 fetch_var_name=fetch_var_name)
 
         self._feed_data(program, feed, feed_var_name, scope)
+        if hasattr(program, 'lr_sheduler'):
+            assert isinstance(program.lr_sheduler,
+                              _LRScheduler), "must be _LRScheduler"
+            lr_sheduler = program.lr_sheduler
+            lr_value = lr_sheduler()
+            lr_var = program.global_block().vars[lr_sheduler._var_name]
+            data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+            tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
+            tensor.set(data, self.place)
+
         if not use_program_cache:
             self._default_executor.run(program.desc, scope, 0, True, True,
                                        fetch_var_name)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 85d4a73b35cf98083b5cafa67546eb974d1088a8..fc4e91aad4fff1db325e17828d26ccd94c164c3d 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -48,6 +48,7 @@ __all__ = [
     'cuda_pinned_places',
     'in_dygraph_mode',
     'is_compiled_with_cuda',
+    'is_compiled_with_xpu',
     'Variable',
     'ComplexVariable',
     'load_op_library',
@@ -64,7 +65,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
 _dygraph_tracer_ = None
-_dygraph_current_expected_place_ = None
+_global_expected_place_ = None
 _current_device = None
 global_prog_seed = 0
 
@@ -247,7 +248,26 @@ def _dygraph_tracer():
 
 
 def _current_expected_place():
-    return _dygraph_current_expected_place_
+    global _global_expected_place_
+    if _global_expected_place_ is None:
+        if core.is_compiled_with_cuda():
+            _global_expected_place_ = core.CUDAPlace(0)
+        else:
+            _global_expected_place_ = core.CPUPlace()
+
+    return _global_expected_place_
+
+
+def _set_dygraph_tracer_expected_place(place):
+    global _dygraph_tracer_
+    if _dygraph_tracer_ is not None:
+        _dygraph_tracer_._expected_place = place
+
+
+def _set_expected_place(place):
+    global _global_expected_place_
+    _global_expected_place_ = place
+    _set_dygraph_tracer_expected_place(place)
 
 
 # TODO(zhiqiu): remove this function.
@@ -291,6 +311,21 @@ def _cuda_ids():
     return device_ids
 
 
+def is_compiled_with_xpu():
+    """
+    Whether this whl package can be used to run the model on XPU.
+
+    Returns (bool): support xpu or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            support_xpu = fluid.is_compiled_with_xpu()
+    """
+    return core.is_compiled_with_xpu()
+
+
 def is_compiled_with_cuda():
     """
     Whether this whl package can be used to run the model on GPU.
@@ -1071,15 +1106,18 @@ class Variable(object):
         pass
 
     @fake_interface_only
-    def backward(self, backward_strategy=None):
+    def backward(self, retain_graph=False):
         """
         **Notes**:
             **This API is ONLY available in Dygraph mode**
 
-        Run backward of current Graph which starts from current Variable
+        Run backward of current Graph which starts from current Tensor.
 
         Args:
-            backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
+            retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
+                like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+                :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
+                Defaults to False.
 
         Returns:
             NoneType: None
@@ -1087,23 +1125,21 @@ class Variable(object):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
                 import numpy as np
+                import paddle
+                paddle.disable_static()
 
                 x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                        tmp = fluid.dygraph.base.to_variable(x)
-                        # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
-                        # there is no one need gradient on it.
-                        tmp.stop_gradient=False
-                        inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.to_tensor(x)
+                    # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
+                    # there is no one need gradient on it.
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
 
         """
         pass
@@ -1135,9 +1171,7 @@ class Variable(object):
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
 
                 # example2: return tuple of ndarray
@@ -1183,9 +1217,7 @@ class Variable(object):
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
                     loss2.clear_gradient()
                     print("After clear {}".format(loss2.gradient()))
@@ -1689,34 +1721,40 @@ def get_all_op_protos():
 
 class ComplexVariable(object):
     """
-    The Variable defined on the complex number domain. It contains two common 
-    real number Variables as its members, :attr:`real` and :attr:`imag` 
+    The ComplexTensor defined on the complex number domain. It contains two common 
+    real number Tensor as its members, :attr:`real` and :attr:`imag` 
     holding the real part and imaginary part of complex numbers respectively.
     
     **Notes**:
-        **The constructor of ComplexVariable should not be invoked directly.**
+        **The constructor of ComplexTensor should not be invoked directly.**
 
-        **Only support dygraph mode at present. Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph ComplexVariable with complex number data.**
+        **Only support dygraph mode at present. Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph ComplexTensor with complex number data.**
 
     Args:
-        real (Variable): The Variable holding real-part data.
-        imag (Variable): The Variable holding imaginery-part data.
+        real (Tensor): The Tensor holding real-part data.
+        imag (Tensor): The Tensor holding imaginery-part data.
     
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            a = np.array([1.0+2.0j, 0.2])
-            with fluid.dygraph.guard():
-                var = fluid.dygraph.to_variable(a, name="new_var")
-                print(var.name, var.dtype, var.shape)
-                # ({'real': u'new_var.real', 'imag': u'new_var.imag'}, 'complex128', [2L]) 
-                print(var.numpy())
-                # [1. +2.j 0.2+0.j]
+            paddle.enable_imperative()
+            x = paddle.to_tensor([1.0+2.0j, 0.2])
+            print(x.name, x.dtype, x.shape)
+            # ({'real': 'generated_tensor_0.real', 'imag': 'generated_tensor_0.imag'}, 'complex128', [2L])
+            print(x.numpy())
+            # [1. +2.j 0.2+0.j]
+            print(type(x))
+            # <class 'paddle.ComplexTensor'>
     """
 
+    def __new__(cls, *arg, **kwargs):
+        cls.__module__ = "paddle"
+        cls.__name__ = "ComplexTensor"
+        return super(ComplexVariable, cls).__new__(cls)
+
     def __init__(self, real, imag):
         assert real.shape == imag.shape, "The real part and imaginary part " \
             "of a ComplexVariable should have the same shape!"
@@ -1763,7 +1801,9 @@ class ComplexVariable(object):
         return self.real.numpy() + 1j * self.imag.numpy()
 
     def __str__(self):
-        return "REAL: " + self.real.__str__() + "IMAG: " + self.imag.__str__()
+        return "ComplexTensor[real]: %s\n%s\nComplexTensor[imag]: %s\n%s" % (
+            self.real.name, str(self.real.value().get_tensor()), self.imag.name,
+            str(self.imag.value().get_tensor()))
 
     __repr__ = __str__
 
@@ -4407,6 +4447,8 @@ class Program(object):
             p._current_role = self._current_role
             p.__op_role_var = self.__op_role_var
             p._appending_grad_times = self._appending_grad_times
+            if hasattr(self, 'lr_sheduler'):
+                p.lr_sheduler = self.lr_sheduler
 
             #NOTE(zhiqiu): we sync the cloned program, to update its program by
             # its desc.
@@ -5092,12 +5134,13 @@ class Parameter(Variable):
 
 class ParamBase(core.VarBase):
     """
-    ParamBase is derived from VarBase( Which is the Variable in Dygraph Mode ). A ParamBase is a persistable
-    VarBase, and will be updated by optimizers after each iteration.
+    ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). 
+    A ParamBase is a persistable Tensor, and will be updated by optimizers 
+    after each iteration.
     The training of a neural network is essentially the updating of
     its ParamBase.
 
-    Relative to a general Variable, a ParamBase has several its own
+    Relative to a general Tensor, a ParamBase has several its own
     member variables:
 
     Args:
@@ -5175,7 +5218,7 @@ class ParamBase(core.VarBase):
             .. code-block:: python
 
                 import paddle
-                paddle.enable_imperative()
+                paddle.disable_static()
                 conv = paddle.nn.Conv2D(3, 3, 5)
                 print(conv.weight)
                 # Parameter: conv2d_0.w_0
@@ -5184,13 +5227,10 @@ class ParamBase(core.VarBase):
                 #   - layout: NCHW
                 #   - dtype: float
                 #   - data: [...] 
-                paddle.disable_imperative()
+                paddle.enable_static()
         """
-        tensor = self.value().get_tensor()
-        if tensor._is_initialized():
-            return 'Parameter: %s\n%s' % (self.name, str(tensor))
-        else:
-            return 'Parameter: %s, not initialized' % (self.name)
+        return "Parameter containing:\n  {}\n  - stop_gradient: {}".format(
+            super(ParamBase, self).__str__(), self.stop_gradient)
 
     __repr__ = __str__
 
@@ -5411,14 +5451,14 @@ def _dygraph_guard(tracer):
 
 @signature_safe_contextmanager
 def _dygraph_place_guard(place):
-    global _dygraph_current_expected_place_
-    tmp_place = _dygraph_current_expected_place_
-    _dygraph_current_expected_place_ = place
+    global _global_expected_place_
+    tmp_place = _global_expected_place_
+    _global_expected_place_ = place
 
     try:
         yield
     finally:
-        _dygraph_current_expected_place_ = tmp_place
+        _global_expected_place_ = tmp_place
 
 
 def load_op_library(lib_filename):
diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e11b2e484dce1dd4260b3052d0f0a58f3cfc420a
--- /dev/null
+++ b/python/paddle/fluid/generator.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is definition of generator class, which is for managing the state of the algorithm that produces pseudo random numbers."""
+
+from . import core
+
+__all__ = ['Generator']
+
+default_rng_seed_val = 34342423252
+
+
+class Generator(object):
+    """Generator class"""
+
+    def __init__(self, device="CPU"):
+        """init"""
+        self.device = device
+        seed_in = default_rng_seed_val
+        if self.device == "CPU":
+            self.generator = core.Generator()
+            # self.generator.manual_seed(seed_in)
+        else:
+            raise ValueError(
+                "generator class with device %s does not exist, currently only support generator with device 'CPU' "
+                % device)
+
+    def get_state(self):
+        return self.generator.get_state()
+
+    def set_state(self, state):
+        self.generator.set_state(state)
+
+    def manual_seed(self, seed):
+        self.generator.manual_seed(seed)
+
+    def seed(self):
+        return self.generator.seed()
+
+    def initial_seed(self):
+        return self.generator.initial_seed()
+
+    def random(self):
+        return self.generator.random()
+
+    def get_cpu_engine(self):
+        return self.generator.get_cpu_engine()
+
+    def set_cpu_engine(self, cpu_engine):
+        self.generator.set_cpu_engine(cpu_engine)
diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
index 094aa842d060d1eec411b545dd544485cd5f0e11..ad51a043a0a50f89f77811adb7f95759a4f220be 100644
--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -24,7 +24,6 @@ from threading import Thread, current_thread
 from contextlib import contextmanager
 
 from paddle.fluid import unique_name, compiler
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
 from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel
 from paddle.fluid.framework import in_dygraph_mode, Program
 
@@ -306,6 +305,7 @@ class TrainEpochRange(SerializableBase):
         if self._checker.ce_test:
             config = None
 
+        from paddle.distributed.fleet.utils.fs import HDFSClient
         self._hdfs = HDFSClient(self._checker.hdfs_home, config)
 
         self._cper = CheckpointSaver(self._hdfs)
diff --git a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
index da94f8cd14a939d03567a146753832a75fc6ea0d..08400ab13a25dd0f460bb3aedc30936bbca0d83a 100644
--- a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+++ b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..fleet.utils.fs import FS, LocalFS
-from ..fleet.utils.hdfs import HDFSClient
 from ...compiler import CompiledProgram
 
 
@@ -81,6 +79,7 @@ class CheckpointSaver(object):
         tmp_path = "{}.tmp".format(real_path)
         saved_path = tmp_path
 
+        from paddle.distributed.fleet.utils.fs import LocalFS
         local_fs = LocalFS()
 
         cache_path = None
@@ -121,7 +120,6 @@ class CheckpointSaver(object):
         Deserialize objects in slists from path
         Return really load path
         """
-
         if checkpoint_no is None:
             max_no = self._get_last_checkpoint_no(path)
 
@@ -136,6 +134,7 @@ class CheckpointSaver(object):
             assert isinstance(checkpoint_no, int)
             assert checkpoint_no >= 0
 
+        from paddle.distributed.fleet.utils.fs import LocalFS
         local_fs = LocalFS()
         if self._fs.need_upload_download():
             cache_path = "{}/{}.{}.load_cache".format(
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
index f236a3e98c61bade5804e7a91978352174a9c5b2..f885e51ef7f0d82ca50c7beb6ee6cd443dfc61d4 100644
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGD
 
 from paddle.fluid.incubate.fleet.base.mode import Mode
-from paddle.fleet.base.role_maker import RoleMakerBase
+from paddle.distributed.fleet.base.role_maker import RoleMakerBase
 from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
 from . import mode
 
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index 6ad6f61e2d1c1b4e2522f20cfd73eb923b5ffc2c..6e5aae82517d1e0f408ebd7311e1c77a86fe426f 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -26,7 +26,6 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 
 from paddle.fluid import compiler
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver
 
 import os
@@ -143,14 +142,13 @@ class Collective(Fleet):
                         path,
                         trainer_id,
                         train_status,
+                        fs,
                         main_program=None,
-                        fs=LocalFS(),
                         local_cache_path=".cache",
                         remain_all_checkpoint=True):
         """
         This function save persistables and current epoch num to path.
         """
-
         if main_program == None:
             main_program = self._transpiled_program
 
@@ -173,8 +171,8 @@ class Collective(Fleet):
                         path,
                         trainer_id,
                         train_status,
+                        fs,
                         main_program=None,
-                        fs=LocalFS(),
                         local_cache_path=".cache",
                         ignore_empty=True):
         """
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index d2c7397c85f8df155444d9272c7b75596f0fe169..1a7a82fbfac19b41e8b96c231ca74398f6b2214c 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -38,6 +38,7 @@ from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 from paddle.fluid.incubate.fleet.parameter_server import version
 from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
 from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.public import _has_global_step
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig, DistributedStrategy, \
     SyncStrategy, AsyncStrategy, HalfAsyncStrategy, GeoStrategy, StrategyFactory
 
@@ -161,9 +162,9 @@ class FleetTranspiler(Fleet):
 
         print(trainer_config)
 
-        lrs = _get_lr_ops(self._origin_main_program)
+        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))
 
-        if len(lrs) > 0:
+        if lrs > 0:
             kwargs = {"need_global_step": "1"}
         else:
             kwargs = {"need_global_step": "0"}
@@ -186,14 +187,6 @@ class FleetTranspiler(Fleet):
             recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                 recv_type=1)
 
-        for name, ctx in send_ctx.items():
-            print("name: {}, ctx: {}".format(name, ctx))
-
-        print("==== = ==== =============== ====")
-
-        for name, ctx in recv_ctx.items():
-            print("name: {}, ctx: {}".format(name, ctx))
-
         from paddle.fluid.communicator import Communicator
         self._communicator = Communicator(
             trainer_config.mode, kwargs,
@@ -393,6 +386,12 @@ class FleetTranspiler(Fleet):
                 "in fleet.save_inference_model() function, executor must be as Executor type"
             )
 
+        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
+        if not isinstance(executor.place, fluid.CPUPlace):
+            save_executor = Executor(fluid.CPUPlace())
+        else:
+            save_executor = executor
+
         if main_program is not None:
             if isinstance(main_program, CompiledProgram):
                 raise TypeError(
@@ -670,6 +669,11 @@ if you would like to save all variables in a
             raise TypeError(
                 "in fleet.save_persistables() function, executor must be as Executor type"
             )
+        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
+        if not isinstance(executor.place, fluid.CPUPlace):
+            save_executor = Executor(fluid.CPUPlace())
+        else:
+            save_executor = executor
 
         if main_program is None:
             main_program = self.main_program
@@ -679,7 +683,8 @@ if you would like to save all variables in a
                 "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
             )
 
-        self._save_distributed_persistables(executor, dirname, main_program)
+        self._save_distributed_persistables(save_executor, dirname,
+                                            main_program)
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index b96eff19e9b9c5d8e78b85e61b9a69afee106546..f9889997d9e38c98c4a736a62dbc72da7029f337 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -43,6 +43,8 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR
 OP_NAME_SCOPE = "op_namescope"
 CLIP_OP_NAME_SCOPE = "@CLIP"
 STEP_COUNTER = "@PS_STEP_COUNTER@"
+LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@"
+
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
@@ -62,6 +64,17 @@ def _get_lr_ops(program):
     return lr_ops
 
 
+def _has_global_step(lr_ops):
+    if len(lr_ops) > 0:
+        for idx, op in enumerate(lr_ops):
+            if op.type != 'increment':
+                continue
+            counter = op.input("X")[0]
+            if counter == LEARNING_RATE_DECAY_COUNTER:
+                return True
+    return False
+
+
 def is_sparse_op(op):
     if op.type == "lookup_table" and op.attr('is_sparse') is True and op.attr(
             'is_distributed') is False:
diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py
index 347927509e6d539555bd4d1b7a594febbc68f57b..15a3022f932f4a702bf7f94ed936468b6a06e94e 100644
--- a/python/paddle/fluid/input.py
+++ b/python/paddle/fluid/input.py
@@ -17,10 +17,12 @@ import warnings
 from .framework import Variable, in_dygraph_mode
 from .layer_helper import LayerHelper
 from .data_feeder import check_variable_and_dtype, check_dtype
+from ..utils import deprecated
 
 __all__ = ['one_hot', 'embedding']
 
 
+@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot')
 def one_hot(input, depth, allow_out_of_range=False):
     """
     :alias_main: paddle.nn.functional.one_hot
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index 201cc61e4d479dc11b169e02481ac4ff4780c2b8..ef469377acfbc0c2c521de61f8eacc0f7c9f0854 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import paddle
 from .framework import Program, program_guard, unique_name, cuda_places, cpu_places
 from .param_attr import ParamAttr
 from .initializer import Constant
@@ -44,10 +45,23 @@ class SimpleLayer(Layer):
 
 
 def run_check():
-    ''' install check to verify if install is success
-
+    """To check whether install is successful
     This func should not be called only if you need to verify installation
-    '''
+
+    Examples:
+        .. code-block: python
+
+            import paddle.fluid as fluid
+            fluid.install_check.run_check()
+
+            # If installed successfully, output may be
+            # Running Verify Fluid Program ... 
+            # W0805 04:24:59.496919 35357 device_context.cc:268] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.2, Runtime API Version: 10.1
+            # W0805 04:24:59.505594 35357 device_context.cc:276] device: 0, cuDNN Version: 7.6.
+            # Your Paddle Fluid works well on SINGLE GPU or CPU.
+            # Your Paddle Fluid works well on MUTIPLE GPU or CPU.
+            # Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now
+    """
     print("Running Verify Fluid Program ... ")
 
     device_list = []
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index ffe8939cd7a39cd7835fd9d0ab74dd66d4f24981..6e5f7fd035acfeab975f63b0794829d57f9bb239 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1669,9 +1669,6 @@ def _load_persistable_nodes(executor, dirname, graph):
 def save(program, model_path):
     """
     :api_attr: Static Graph
-	:alias_main: paddle.save
-	:alias: paddle.save,paddle.tensor.save,paddle.tensor.io.save
-	:old_api: paddle.fluid.save
 
     This function save parameters, optimizer information and network description to  model_path.
 
@@ -1733,9 +1730,6 @@ def save(program, model_path):
 def load(program, model_path, executor=None, var_list=None):
     """
     :api_attr: Static Graph
-	:alias_main: paddle.load
-	:alias: paddle.load,paddle.tensor.load,paddle.tensor.io.load
-	:old_api: paddle.fluid.io.load
 
     This function get parameters and optimizer information from program, and then get corresponding value from file.
     An exception will throw if shape or dtype of the parameters is not match.
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 0b57b3fefd414c483c537957ed6ca3cfdd58fa65..6e38c855562809fa38cddbf6e58eb4eee6b899f3 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -23,8 +23,13 @@ from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from .initializer import _global_weight_initializer, _global_bias_initializer
 
+__all__ = ['LayerHelperBase']
+
 
 class LayerHelperBase(object):
+    # global dtype
+    __dtype = "float32"
+
     def __init__(self, name, layer_type):
         self._layer_type = layer_type
         self._name = name
@@ -45,6 +50,14 @@ class LayerHelperBase(object):
     def startup_program(self):
         return default_startup_program()
 
+    @classmethod
+    def set_default_dtype(cls, dtype):
+        cls.__dtype = dtype
+
+    @classmethod
+    def get_default_dtype(cls):
+        return cls.__dtype
+
     def to_variable(self, value, name=None):
         """
         The API will create a ``Variable`` object from numpy\.ndarray or Variable object.
@@ -277,7 +290,7 @@ class LayerHelperBase(object):
     def create_parameter(self,
                          attr,
                          shape,
-                         dtype,
+                         dtype=None,
                          is_bias=False,
                          default_initializer=None,
                          stop_gradient=False,
@@ -299,6 +312,9 @@ class LayerHelperBase(object):
         if not attr:
             return None
         assert isinstance(attr, ParamAttr)
+        # set global dtype
+        if not dtype:
+            dtype = self.__dtype
         if is_bias:
             suffix = 'b'
             default_initializer = _global_bias_initializer(
@@ -372,6 +388,9 @@ class LayerHelperBase(object):
             based on operator's `VarTypeInference` implementation in
             infer_var_type.
         """
+        # set global dtype
+        if not dtype:
+            dtype = self.__dtype
         return self.main_program.current_block().create_var(
             name=unique_name.generate_with_ignorable_key(".".join(
                 [self.name, 'tmp'])),
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index b179d00626249849f64f0fc571cb2e85cf08ea05..2002b8a95decfd6d6c55538e2dff0a793828dd9b 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1511,7 +1511,7 @@ def array_write(x, i, array=None):
         assert i.shape == [
             1
         ], "The shape of index 'i' should be [1] in dygraph mode"
-        i = i.numpy()[0]
+        i = i.numpy().item(0)
         if array is None:
             array = create_array(x.dtype)
         assert isinstance(
@@ -1976,7 +1976,7 @@ def array_read(array, i):
         assert i.shape == [
             1
         ], "The shape of index 'i' should be [1] in dygraph mode"
-        i = i.numpy()[0]
+        i = i.numpy().item(0)
         return array[i]
 
     check_variable_and_dtype(i, 'i', ['int64'], 'array_read')
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 4217a98798ebbb46cb5b84e4c15fea4b4f0840ac..f468815c99ea2751913c5535c721ee9a6a5c5052 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import numpy as np
 from functools import partial, reduce
+from paddle.utils import deprecated
 from . import nn
 from .layer_function_generator import templatedoc
 from ..layer_helper import LayerHelper
@@ -1619,6 +1620,7 @@ def huber_loss(input, label, delta):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div")
 @templatedoc()
 def kldiv_loss(x, target, reduction='mean', name=None):
     """
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index fd1e7f800b928cfcecb9e09877f08c42c81defa6..38fc34472c8bc64338e2468bdf3f4b0bab1370ce 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import warnings
 import inspect
+import paddle
 
 from .. import core
 from ..framework import Variable, unique_name
@@ -45,6 +46,7 @@ EXPRESSION_MAP = {
     "__pow__": "A ** B",
     "__rpow__": "A **= B",
     "__floordiv__": "A //B",
+    "__rfloordiv__": "A //= B",
     "__mod__": "A % B",
     "__eq__": "A == B",
     "__ne__": "A != B",
@@ -54,6 +56,31 @@ EXPRESSION_MAP = {
     "__ge__": "A >= B"
 }
 
+# method for Tensor from paddle.tensor
+# edit it when paddle.tensor has new method about Tensor operation
+common_methods = [
+    'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos',
+    'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square',
+    'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross',
+    'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than',
+    'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and',
+    'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all',
+    'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as',
+    'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter',
+    'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze',
+    'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip',
+    'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal',
+    'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max',
+    'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf',
+    'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort',
+    'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort',
+    'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div',
+    'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow',
+    'elementwise_sub'
+]
+
+_already_patch_variable = False
+
 
 def monkey_patch_variable():
     def unique_tmp_name():
@@ -179,7 +206,7 @@ def monkey_patch_variable():
                    "out_dtype": out.dtype})
         return out
 
-    def _scalar_elementwise_op_(var, scale, bias):
+    def _scalar_op_(var, scale, bias):
         block = current_block(var)
         out = create_new_tmp_var(block, var.dtype)
         block.append_op(
@@ -191,27 +218,46 @@ def monkey_patch_variable():
         return out
 
     def _neg_(var):
-        return _scalar_elementwise_op_(var, -1.0, 0.0)
+        return _scalar_op_(var, -1.0, 0.0)
+
+    def _scalar_add_(var, value):
+        return _scalar_op_(var, 1.0, value)
 
-    def _scalar_elementwise_add_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, value)
+    def _scalar_sub_(var, value):
+        return _scalar_op_(var, 1.0, -value)
 
-    def _scalar_elementwise_sub_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, -value)
+    def _scalar_rsub_(var, value):
+        return _scalar_op_(var, -1.0, value)
 
-    def _scalar_elementwise_rsub_(var, value):
-        return _scalar_elementwise_op_(var, -1.0, value)
+    def _scalar_mul_(var, value):
+        return _scalar_op_(var, value, 0.0)
 
-    def _scalar_elementwise_mul_(var, value):
-        return _scalar_elementwise_op_(var, value, 0.0)
+    def _scalar_div_(var, value):
+        return _scalar_op_(var, 1.0 / value, 0.0)
 
-    def _scalar_elementwise_div_(var, value):
-        return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
+    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
+    # for binary operator by using the api to achieve the type promotion
+    def _binary_method_creator_(op_type, reverse=False):
+        import paddle
+
+        def __impl__(self, other_var):
+            op = getattr(paddle, op_type)
+            if reverse:
+                return op(other_var, self)
+            else:
+                return op(self, other_var)
+
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
 
-    def _elemwise_method_creator_(method_name,
-                                  op_type,
-                                  reverse=False,
-                                  scalar_method=None):
+    def _binary_creator_(method_name,
+                         op_type,
+                         reverse=False,
+                         scalar_method=None):
         def __impl__(self, other_var):
             # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
             # which may lose accuracy. This is a hot fix for release 1.6.
@@ -296,35 +342,56 @@ def monkey_patch_variable():
         __impl__.__name__ = method_name
         return __impl__
 
-    # inject methods
-    for method_name, op_type, reverse, scalar_method in (
-        ("__add__", "elementwise_add", False, _scalar_elementwise_add_),
-            # a+b == b+a. Do not need to reverse explicitly
-        ("__radd__", "elementwise_add", False, _scalar_elementwise_add_),
-        ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_),
-        ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_),
-        ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-            # a*b == b*a. Do not need to reverse explicitly
-        ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-        ("__div__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__rdiv__", "elementwise_div", True, None),
-        ("__rtruediv__", "elementwise_div", True, None),
-        ("__pow__", "elementwise_pow", False, None),
-        ("__rpow__", "elementwise_pow", True, None),
-        ("__floordiv__", "elementwise_floordiv", False, None),
-        ("__mod__", "elementwise_mod", False, None),
-            # for logical compare
-        ("__eq__", "equal", False, None),
-        ("__ne__", "not_equal", False, None),
-        ("__lt__", "less_than", False, None),
-        ("__le__", "less_equal", False, None),
-        ("__gt__", "greater_than", False, None),
-        ("__ge__", "greater_equal", False, None)):
-        setattr(Variable, method_name,
-                _elemwise_method_creator_(method_name, op_type, reverse,
-                                          scalar_method))
-
-    # b = -a
-    Variable.__neg__ = _neg_
-    Variable.astype = astype
+    variable_methods = [
+        #   b=-a
+        ('__neg__', _neg_),
+        ('astype', astype),
+        ('__add__', _binary_creator_('__add__', 'elementwise_add', False,
+                                     _scalar_add_)),
+        #  a+b == b+a. Do not need to reverse explicitly
+        ('__radd__',
+         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
+                                     _scalar_sub_)),
+        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
+                                      _scalar_rsub_)),
+        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
+                                     _scalar_mul_)),
+        #  a*b == b*a. Do not need to reverse explicitly
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
+                                     None)),
+        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
+                                      None)),
+        # These binary use paddle.optype
+        ('__div__', _binary_method_creator_('divide', False)),
+        ('__rdiv__', _binary_method_creator_('divide', True)),
+        ('__truediv__', _binary_method_creator_('divide', False)),
+        ('__rtruediv__', _binary_method_creator_('divide', True)),
+        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
+        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
+        ('__mod__', _binary_method_creator_('remainder', False)),
+        #  for logical compare
+        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None))
+    ]
+
+    global _already_patch_variable
+    if not _already_patch_variable:
+        for method in variable_methods:
+            method_name = method[0]
+            method_impl = method[1]
+            setattr(Variable, method_name, method_impl)
+    else:
+        import paddle.tensor
+        for method_name in common_methods:
+            if hasattr(Variable, method_name): continue
+            method_impl = getattr(paddle.tensor, method_name, None)
+            if method_impl: setattr(Variable, method_name, method_impl)
+
+    _already_patch_variable = True
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
old mode 100644
new mode 100755
index ae42b3bbdf0ada9ab19dc1ad6cc2f1e09def214d..ec71e4c9912295ca0844ce91dd2e06e03d9a216d
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -26,7 +26,7 @@ import six
 import paddle
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program
+from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator
 from .. import dygraph_utils
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -35,6 +35,7 @@ from . import utils
 from .. import unique_name
 from functools import reduce
 from .. import core
+from ...utils import deprecated
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
 import paddle
 from paddle.utils import deprecated
@@ -931,6 +932,7 @@ def cos_sim(X, Y):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout")
 def dropout(x,
             dropout_prob,
             is_test=False,
@@ -938,9 +940,6 @@ def dropout(x,
             name=None,
             dropout_implementation="downgrade_in_infer"):
     """
-    :alias_main: paddle.nn.functional.dropout
-	:alias: paddle.nn.functional.dropout,paddle.nn.functional.common.dropout
-	:old_api: paddle.fluid.layers.dropout
 
     Computes dropout.
 
@@ -1188,12 +1187,9 @@ def chunk_eval(input,
             num_correct_chunks)
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax")
 def softmax(input, use_cudnn=False, name=None, axis=-1):
     """
-    :alias_main: paddle.nn.functional.softmax
-	:alias: paddle.nn.functional.softmax,paddle.nn.functional.activation.softmax
-	:old_api: paddle.fluid.layers.softmax
-
     This operator implements the softmax layer. The calculation process is as follows:
 
     1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
@@ -1307,8 +1303,8 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
     attrs = {"axis": axis, "use_cudnn": use_cudnn}
 
     helper = LayerHelper('softmax', **locals())
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'softmax')
+    check_variable_and_dtype(input, 'input/x',
+                             ['float16', 'float32', 'float64'], 'softmax')
 
     dtype = helper.input_dtype()
     softmax_out = helper.create_variable_for_type_inference(dtype)
@@ -3366,6 +3362,15 @@ def data_norm(input,
         "BatchSum": batch_sum,
         "BatchSquareSum": batch_square_sum
     }
+    attrs = {
+        "epsilon": epsilon,
+        "sync_stats": sync_stats,
+        "summary_decay_rate": summary_decay_rate,
+    }
+    if slot_dim > 0:
+        attrs["slot_dim"] = slot_dim
+    if enable_scale_and_shift:
+        attrs["enable_scale_and_shift"] = enable_scale_and_shift
     if enable_scale_and_shift:
         inputs["scale_w"] = scale_w
         inputs["bias"] = bias
@@ -3380,13 +3385,7 @@ def data_norm(input,
             "BatchSum": batch_sum,
             "BatchSquareSum": batch_square_sum
         },
-        attrs={
-            "epsilon": epsilon,
-            "slot_dim": slot_dim,
-            "sync_stats": sync_stats,
-            "summary_decay_rate": summary_decay_rate,
-            "enable_scale_and_shift": enable_scale_and_shift
-        })
+        attrs=attrs)
 
     return helper.append_activation(data_norm_out)
 
@@ -4401,12 +4400,9 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.mean")
 def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
-    :alias_main: paddle.reduce_mean
-	:alias: paddle.reduce_mean,paddle.tensor.reduce_mean,paddle.tensor.stat.reduce_mean
-	:old_api: paddle.fluid.layers.reduce_mean
-
     Computes the mean of the input tensor's elements along the given dimension.
 
     Args:
@@ -4455,31 +4451,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0]
     """
 
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-
-    if in_dygraph_mode():
-        reduce_all = True if dim == None or dim == [] or len(dim) == len(
-            input.shape) else False
-        dim = dim if dim != None and dim != [] else [0]
-        return core.ops.reduce_mean(input, 'dim', dim, 'keep_dim', keep_dim,
-                                    'reduce_all', reduce_all)
-    attrs = {
-        'dim': dim if dim != None and dim != [] else [0],
-        'keep_dim': keep_dim,
-        'reduce_all': True
-        if dim == None or dim == [] or len(dim) == len(input.shape) else False
-    }
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_mean')
-    helper = LayerHelper('reduce_mean', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='reduce_mean',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs=attrs)
-    return out
+    return paddle.mean(x=input, axis=dim, keepdim=keep_dim, name=name)
 
 
 def reduce_max(input, dim=None, keep_dim=False, name=None):
@@ -4625,7 +4597,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
     Args:
         input (Variable): The input variable which is a Tensor, the data type is float32,
             float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the product is performed. If
+        dim (int|list|tuple, optional): The dimensions along which the product is performed. If
             :attr:`None`, multiply all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
             range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
@@ -4665,9 +4637,18 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+        if isinstance(dim, tuple):
+            dim = list(dim)
+        elif isinstance(dim, int):
+            dim = [dim]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".
+                format(type(dim)))
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod')
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_prod',
         inputs={'X': input},
@@ -4872,7 +4853,7 @@ def split(input, num_or_sections, dim=-1, name=None):
 
         if isinstance(dim, Variable):
             dim = dim.numpy()
-            dim = dim[0]
+            dim = dim.item(0)
         dim = (len(input.shape) + dim) if dim < 0 else dim
         attrs += ('axis', dim)
 
@@ -4897,7 +4878,7 @@ def split(input, num_or_sections, dim=-1, name=None):
 
     check_variable_and_dtype(
         input, 'input',
-        ['bool', 'float16', 'float32', 'float64', 'int32', 'in64'], 'split')
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split')
     check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split')
     check_type(dim, 'dim', (int, Variable), 'split')
     if isinstance(dim, Variable):
@@ -5055,6 +5036,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.matmul")
 def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     """
     Applies matrix multiplication to two tensors.
@@ -5126,7 +5108,65 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
             y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32')
             out = fluid.layers.matmul(x, y, True, True)
     """
-    return paddle.matmul(x, y, transpose_x, transpose_y, alpha, name)
+    attrs = {
+        'transpose_X': transpose_x,
+        'transpose_Y': transpose_y,
+        'alpha': float(alpha),
+    }
+
+    if in_dygraph_mode():
+        out = _varbase_creator(dtype=x.dtype)
+        core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y',
+                        transpose_y, 'alpha', float(alpha))
+        return out
+
+    def __check_input(x, y):
+        var_names = {'x': x, 'y': y}
+        for name, val in var_names.items():
+            check_variable_and_dtype(
+                val, name, ['float16', 'float32', 'float64'], 'matmul')
+        x_shape = list(x.shape)
+        y_shape = list(y.shape)
+        if len(x_shape) == 1:
+            x_shape = [1] + x_shape
+        if len(y_shape) == 1:
+            y_shape = y_shape + [1]
+
+        # check the inner 2 dimensions
+        if transpose_x:
+            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
+        if transpose_y:
+            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
+        if x_shape[-1] != y_shape[-2]:
+            assert (x_shape[-1] == -1) or (y_shape[-2] == -1),                         \
+                "After performing an optional transpose, Input X's width should be "   \
+                "equal to Y's width for multiplication "                               \
+                "prerequisites. But received X's shape: %s, Y's shape: %s\n" %         \
+                (x_shape, y_shape)
+
+        if len(y_shape) > 2 and len(x_shape) > 2:
+            for i, dim_x in enumerate(x_shape[:-2]):
+                # don't check neg shape
+                if dim_x < 0 or y_shape[i] < 0:
+                    continue
+                if dim_x != y_shape[i]:
+                    raise ValueError(
+                        "When the matrix is larger than 2 dimensions, the higher "
+                        "dimensional values of the two matrices need to be equal. "
+                        "But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
+                        "Y's shape: %s.\n" % (i, i, x_shape, y_shape))
+
+    __check_input(x, y)
+
+    helper = LayerHelper('matmul', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs=attrs)
+    return out
 
 
 def topk(input, k, name=None):
@@ -5831,6 +5871,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     return loss
 
 
+@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot')
 def one_hot(input, depth, allow_out_of_range=False):
     """
 
@@ -5916,7 +5957,7 @@ def one_hot(input, depth, allow_out_of_range=False):
             depth = depth.numpy()
             assert depth.shape == (
                 1, ), "depth of type Variable should have shape [1]"
-            depth = depth[0]
+            depth = depth.item(0)
         out = core.ops.one_hot(input, 'depth', depth, 'allow_out_of_range',
                                allow_out_of_range)
         out.stop_gradient = True
@@ -5994,7 +6035,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
     """
     :alias_main: paddle.reshape
 	:alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape
-	:old_api: paddle.fluid.layers.reshape
 
     This operator changes the shape of ``x`` without changing its data.
 
@@ -6037,14 +6077,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
         The parameter ``actual_shape`` will be deprecated in the future and only use ``shape`` instead to represent the target shape.
 
     Args:
-        x(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
-        shape(list|tuple|Variable): Define the target shape. At most one dimension of the target shape can be -1.
+        x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
                         The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
-                        If ``shape`` is an Variable, it should be an 1-D Tensor .
+                        If ``shape`` is an Tensor, it should be an 1-D Tensor .
         actual_shape(variable, optional): An 1-D ``Tensor`` or ``LoDTensor`` . The data type is ``int32`` . If provided, reshape
                                 according to this given shape rather than ``shape`` specifying shape.
                                 That is to say ``actual_shape`` has a higher priority
-                                than ``shape(list|tuple)`` but not ``shape(Variable)``. \
+                                than ``shape(list|tuple)`` but not ``shape(Tensor)``. \
                                 This argument ``actual_shape`` will be removed in a future version. \
                                 Instructions for updating: ``actual_shape`` will be removed in future versions and replaced by ``shape``.
         act (str, optional): The non-linear activation to be applied to the reshaped input. Default None.
@@ -6056,10 +6096,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
                             For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable.
+        Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable.
 
     Raises:
-        TypeError: If actual_shape is neither Variable nor None.
+        TypeError: If actual_shape is neither Tensor nor None.
         ValueError: If more than one elements of ``shape`` is -1.
         ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``.
         ValueError: If the elements in ``shape`` is negative except -1.
@@ -6070,7 +6110,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             import paddle.fluid as fluid
 
             # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
+            # attr shape is a list which doesn't contain Tensors.
             data_1 = fluid.data(
               name='data_1', shape=[2, 4, 6], dtype='float32')
             reshaped_1 = fluid.layers.reshape(
@@ -6078,7 +6118,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             # the shape of reshaped_1 is [2,4,3,2].
 
             # example 2:
-            # attr shape is a list which contains tensor Variable.
+            # attr shape is a list which contains Tensors.
             data_2 = fluid.layers.fill_constant([2,25], "int32", 3)
             dim = fluid.layers.fill_constant([1], "int32", 5)
             reshaped_2 = fluid.layers.reshape(data_2, shape=[dim, 10])
@@ -6098,7 +6138,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             )
         if isinstance(shape, (list, tuple)):
             shape = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in shape
             ]
             out, _ = core.ops.reshape2(x, 'shape', shape)
@@ -8170,9 +8210,9 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
     return image_resize(input=input, out_shape=out_shape, resample=resample)
 
 
+@deprecated(since="2.0.0", update_to="paddle.gather")
 def gather(input, index, overwrite=True):
     """
-    **Gather Layer**
 
     Output is obtained by gathering entries of the outer-most dimension
     of X indexed by `index` and concatenate them together.
@@ -8199,19 +8239,21 @@ def gather(input, index, overwrite=True):
                        [5, 6]]
 
     Args:
-        input (Variable): The source input tensor with rank>=1. Supported data type is
+        input (Tensor): The source input tensor with rank>=1. Supported data type is
             int32, int64, float32, float64 and uint8 (only for CPU),
             float16 (only for GPU).
-        index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
+        index (Tensor): The index input tensor with rank=1. Data type is int32 or int64.
         overwrite (bool, optional): The mode that updating the grad when has same index.
             If True, use the overwrite mode to update the grad of the same index,
 	    if False, use the accumulate mode to update the grad of the same index.
 	    Default value is True.
 
-
-
     Returns:
-        output (Variable): The output is a tensor with the same rank as input.
+        output (Tensor): The output is a tensor with the same rank as input.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64.
 
     Examples:
 
@@ -8222,6 +8264,13 @@ def gather(input, index, overwrite=True):
             index = fluid.data(name='index', shape=[-1, 1], dtype='int32')
             output = fluid.layers.gather(x, index)
     """
+    if in_dygraph_mode():
+        return core.ops.gather(input, index, None)
+
+    check_variable_and_dtype(
+        input, 'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'gather')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
@@ -8234,6 +8283,7 @@ def gather(input, index, overwrite=True):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.gather_nd")
 def gather_nd(input, index, name=None):
     """
     **Gather Nd Layer**
@@ -8286,14 +8336,18 @@ def gather_nd(input, index, name=None):
                          = [23]
 
     Args:
-        input (Variable): The source input. Its dtype should be int32, int64, float32, float64.
-        index (Variable): The index input with rank > 1, index.shape[-1] <= input.rank.
-                          Its dtype should be int32, int64.
-        name (str|None): A name for this layer(optional). If set None, the
-                         layer will be named automatically.
+        input (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64.
+        index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank.
+                        Its dtype should be int32, int64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+                        For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        output (Variable): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+        output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+    
+    Raises:
+        TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64.
 
     Examples:
 
@@ -8305,6 +8359,12 @@ def gather_nd(input, index, name=None):
             output = fluid.layers.gather_nd(x, index)
 
     """
+    if in_dygraph_mode():
+        return core.ops.gather_nd(input, index)
+    check_variable_and_dtype(input, 'input',
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'gather_np')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np')
     helper = LayerHelper('gather_nd', **locals())
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
@@ -8316,6 +8376,7 @@ def gather_nd(input, index, name=None):
     return output
 
 
+@deprecated(since="2.0.0", update_to="paddle.scatter")
 def scatter(input, index, updates, name=None, overwrite=True):
     """
     :alias_main: paddle.scatter
@@ -8631,7 +8692,7 @@ def log(x, name=None):
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu")
 def relu(x, name=None):
     """
     ${comment}
@@ -8673,11 +8734,9 @@ def relu(x, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.selu")
 def selu(x, scale=None, alpha=None, name=None):
     """
-    :alias_main: paddle.nn.functional.selu
-	:alias: paddle.nn.functional.selu,paddle.nn.functional.activation.selu
-	:old_api: paddle.fluid.layers.selu
 
     Selu Operator.
 
@@ -9292,7 +9351,7 @@ def pad2d(input,
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.elu")
 def elu(x, alpha=1.0, name=None):
     """
     :alias_main: paddle.nn.functional.elu
@@ -9334,12 +9393,9 @@ def elu(x, alpha=1.0, name=None):
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu6")
 def relu6(x, threshold=6.0, name=None):
     """
-    :alias_main: paddle.nn.functional.relu6
-	:alias: paddle.nn.functional.relu6,paddle.nn.functional.activation.relu6
-	:old_api: paddle.fluid.layers.relu6
 
     ${comment}
 
@@ -9375,7 +9431,10 @@ def relu6(x, threshold=6.0, name=None):
         type='relu6',
         inputs={'X': x},
         outputs={'Out': out},
-        attrs={'threshold': threshold})
+        attrs={
+            'threshold': threshold,
+            'use_mkldnn': core.globals()["FLAGS_use_mkldnn"]
+        })
     return out
 
 
@@ -9608,6 +9667,7 @@ def swish(x, beta=1.0, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.prelu")
 def prelu(x, mode, param_attr=None, name=None):
     """
     :api_attr: Static Graph
@@ -9667,7 +9727,8 @@ def prelu(x, mode, param_attr=None, name=None):
         ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
         #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
         # To be consistent with Prelu, it is simplified.
-        alpha_shape = [1, x.shape[1]]
+        #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
+        alpha_shape = [1, x.shape[1], 1, 1]
     elif mode == 'element':
         assert len(
             x.shape
@@ -9735,13 +9796,10 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.leaky_relu")
 @templatedoc()
 def leaky_relu(x, alpha=0.02, name=None):
     """
-    :alias_main: paddle.nn.functional.leaky_relu
-	:alias: paddle.nn.functional.leaky_relu,paddle.nn.functional.activation.leaky_relu
-	:old_api: paddle.fluid.layers.leaky_relu
-
     ${comment}
     Args:
         x(${x_type}): ${x_comment}
@@ -9770,19 +9828,7 @@ def leaky_relu(x, alpha=0.02, name=None):
             res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res])
             print(res_val) # [[-0.1, 2], [3, -0.4]]
     """
-    if in_dygraph_mode():
-        return core.ops.leaky_relu(x, 'alpha', alpha)
-
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'leaky_relu')
-
-    inputs = {'X': [x]}
-    attrs = {'alpha': alpha}
-    helper = LayerHelper('leaky_relu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='leaky_relu', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
+    return paddle.nn.functional.leaky_relu(x, alpha, name)
 
 
 def soft_relu(x, threshold=40.0, name=None):
@@ -10121,12 +10167,12 @@ def unstack(x, axis=0, num=None):
     raised.
 
     Args:
-        x (Variable): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64.
+        x (Tensor): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64.
         axis (int): The axis along which the input is unstacked.
         num (int|None): The number of output variables.
 
     Returns:
-        list(Variable): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64.
+        list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64.
 
     Raises:
         ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D).
@@ -10135,7 +10181,7 @@ def unstack(x, axis=0, num=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
+            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
             y = fluid.layers.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]
 
     """
@@ -10222,7 +10268,7 @@ def expand(x, expand_times, name=None):
     if in_dygraph_mode():
         if isinstance(expand_times, (list, tuple)):
             expand_times = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in expand_times
             ]
 
@@ -10347,6 +10393,7 @@ def expand_as(x, target_tensor, name=None):
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
 
 
+@deprecated(since='1.8.0', update_to="paddle.uniform")
 @templatedoc()
 def uniform_random_batch_size_like(input,
                                    shape,
@@ -10442,6 +10489,7 @@ def uniform_random_batch_size_like(input,
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.normal")
 @templatedoc()
 def gaussian_random(shape,
                     mean=0.0,
@@ -10616,6 +10664,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     return out
 
 
+@deprecated(since='1.8.0', update_to="paddle.normal")
 @templatedoc()
 def gaussian_random_batch_size_like(input,
                                     shape,
@@ -10833,11 +10882,11 @@ def slice(input, axes, starts, ends):
         if isinstance(starts, (list, tuple)) and isinstance(ends,
                                                             (list, tuple)):
             starts = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in starts
             ]
             ends = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in ends
             ]
 
@@ -11204,6 +11253,7 @@ def rank(input):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.numel")
 def size(input):
     """
     **Size Layer**
@@ -11211,11 +11261,14 @@ def size(input):
     Returns the number of elements for a tensor, which is a int64 Tensor with shape [1].
 
     Args:
-        input (Variable): The input variable.
+        input (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
 
     Returns:
-        Variable: The number of elements for the input variable.
+        Tensor: The number of elements for the input Tensor.
 
+    Raises:
+        TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64.
+    
     Examples:
         .. code-block:: python
 
@@ -11226,6 +11279,11 @@ def size(input):
             rank = layers.size(input) # 300
     """
 
+    if in_dygraph_mode():
+        return core.ops.size(x)
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        "size")
     helper = LayerHelper('size', **locals())
     out = helper.create_variable_for_type_inference(dtype='int64')
     helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out})
@@ -11441,11 +11499,17 @@ Examples:
     """
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_add')
+            x,
+            y,
+            axis=axis,
+            act=act,
+            op_name='elementwise_add',
+            use_mkldnn=core.globals()["FLAGS_use_mkldnn"])
 
     return _elementwise_op(LayerHelper('elementwise_add', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.divide")
 def elementwise_div(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_div
@@ -11869,6 +11933,7 @@ Examples:
     return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.remainder")
 def elementwise_mod(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_mod
@@ -11906,6 +11971,7 @@ Examples:
     return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.floor_divide")
 def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_floordiv
@@ -12020,6 +12086,13 @@ Examples:
 
 
 def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
+    if in_dygraph_mode():
+        op = getattr(core.ops, op_name)
+        if binary_op:
+            return op(x, y)
+        else:
+            return op(x)
+
     check_variable_and_dtype(x, "x", ["bool"], op_name)
     if y is not None:
         check_variable_and_dtype(y, "y", ["bool"], op_name)
@@ -12044,28 +12117,27 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
     return out
 
 
-@templatedoc()
 def logical_and(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_and
-    :alias: paddle.logical_and, paddle.tensor.logical_and, paddle.tensor.logic.logical_and
-    :old_api: paddle.fluid.layers.logical_and
 
-    ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = x \&\& y
 
+    .. note::
+        ``paddle.logical_and`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, it's data type should be bool.
+        y (Tensor): the input tensor, it's data type should be bool.
+        out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``.
 
     Examples:
         .. code-block:: python
@@ -12073,41 +12145,39 @@ def logical_and(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
+            paddle.disable_static()
+            x_data = np.array([True], dtype=np.bool)
             y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_and(x, y)
-            print(res.numpy()) # [True False False False]
+            print(res.numpy()) # [True False True False]
     """
-
     return _logical_op(
         op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
 
 
-@templatedoc()
 def logical_or(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_or
-    :alias: paddle.logical_or, paddle.tensor.logical_or, paddle.tensor.logic.logical_or
-    :old_api: paddle.fluid.layers.logical_or
 
-    ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = x || y
 
+    .. note::
+        ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+    
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, it's data type should be bool.
+        y (Tensor): the input tensor, it's data type should be bool.
+        out(Tensor): The ``Variable`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``.
 
     Examples:
         .. code-block:: python
@@ -12115,41 +12185,39 @@ def logical_or(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
-            y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            paddle.disable_static()
+            x_data = np.array([True, False], dtype=np.bool).reshape(2, 1)
+            y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_or(x, y)
-            print(res.numpy()) # [True  True  True False]
+            print(res.numpy()) # [[ True  True] [ True False]]
     """
-
     return _logical_op(
         op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
 
 
-@templatedoc()
 def logical_xor(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_xor
-    :alias: paddle.logical_xor, paddle.tensor.logical_xor, paddle.tensor.logic.logical_xor
-    :old_api: paddle.fluid.layers.logical_xor
 
-    ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = (x || y) \&\& !(x \&\& y)
 
+    .. note::
+        ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, it's data type should be bool.
+        y (Tensor): the input tensor, it's data type should be bool.
+        out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``.
 
     Examples:
         .. code-block:: python
@@ -12157,15 +12225,14 @@ def logical_xor(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
-            y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            paddle.disable_static()
+            x_data = np.array([True, False], dtype=np.bool).reshape([2, 1])
+            y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2])
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_xor(x, y)
-            print(res.numpy()) # [False  True  True False]
+            print(res.numpy()) # [[False,  True], [ True, False]]
     """
-
     return _logical_op(
         op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
 
@@ -12197,9 +12264,9 @@ def logical_not(x, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
+            x = paddle.to_variable(x_data)
             res = paddle.logical_not(x)
             print(res.numpy()) # [False  True False  True]
     """
@@ -12211,8 +12278,6 @@ def logical_not(x, out=None, name=None):
 @templatedoc()
 def clip(x, min, max, name=None):
     """
-    :alias_main: paddle.nn.clip
-	:alias: paddle.nn.clip,paddle.nn.clip.clip
 	:old_api: paddle.fluid.layers.clip
 
     ${comment}
@@ -12307,13 +12372,10 @@ def clip_by_norm(x, max_norm, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.mean")
 @templatedoc()
 def mean(x, name=None):
     """
-    :alias_main: paddle.mean
-	:alias: paddle.mean,paddle.tensor.mean,paddle.tensor.stat.mean
-	:old_api: paddle.fluid.layers.mean
-
     ${comment}
 
     Args:
@@ -12331,6 +12393,7 @@ def mean(x, name=None):
                 name='data', shape=[2, 3], dtype='float32')
             mean = fluid.layers.mean(input)
     """
+
     if in_dygraph_mode():
         return core.ops.mean(x)
 
@@ -13991,12 +14054,9 @@ def where(condition):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.sign")
 def sign(x):
     """
-    :alias_main: paddle.sign
-	:alias: paddle.sign,paddle.tensor.sign,paddle.tensor.math.sign
-	:old_api: paddle.fluid.layers.sign
-
     This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
 
     Args:
@@ -14030,17 +14090,11 @@ def sign(x):
 
 def unique(x, dtype='int32'):
     """
-    :alias_main: paddle.unique
-	:alias: paddle.unique,paddle.tensor.unique,paddle.tensor.manipulation.unique
-	:old_api: paddle.fluid.layers.unique
-
-    **unique**
-
     Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
 
     Args:
-        x(Variable): A 1-D input tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64.
+        x(Tensor): A 1-D input tensor, it's data type should be float32, float64, int32, int64.
+        dtype(np.dtype|str, optional): The type of index tensor: int32, int64. Default: int32.
 
     Returns:
         tuple: (out, index). `out` is the unique tensor for `x`, with identical dtype to `x`, and \
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index c8f74c809a780099d60cafdc3dfb80d504a40105..84cacea6ba5723f8a06fc87fa9c59d96f802e65a 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -18,14 +18,22 @@ from .layer_function_generator import generate_layer_fn, generate_activation_fn,
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_, Variable
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from paddle.utils import deprecated
+
+__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', }
 
 __activations_noattr__ = [
     'sigmoid',
     'logsigmoid',
-    'exp',
+    'tanh_shrink',
+    'softplus',
+    'softsign',
     'tanh',
+]
+
+__unary_func__ = [
+    'exp',
     'atan',
-    'tanh_shrink',
     'sqrt',
     'rsqrt',
     'abs',
@@ -33,15 +41,13 @@ __activations_noattr__ = [
     'floor',
     'cos',
     'acos',
-    'asin',
     'sin',
     'sinh',
+    'asin',
     'cosh',
     'round',
     'reciprocal',
     'square',
-    'softplus',
-    'softsign',
 ]
 
 __all__ = []
@@ -57,9 +63,24 @@ globals()['_scale'] = generate_layer_fn('scale')
 globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 
 __all__ += __activations_noattr__
+__all__ += __unary_func__
 
 for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_activation_fn(_OP)
+    _new_OP = _OP
+    if _OP in __deprecated_func_name__:
+        _new_OP = __deprecated_func_name__[_OP]
+    func = generate_activation_fn(_OP)
+    func = deprecated(
+        since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP))(func)
+    globals()[_OP] = func
+
+for _OP in set(__unary_func__):
+    _new_OP = _OP
+    if _OP in __deprecated_func_name__:
+        _new_OP = __deprecated_func_name__[_OP]
+    func = generate_activation_fn(_OP)
+    func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(func)
+    globals()[_OP] = func
 
 add_sample_code(globals()["sigmoid"], r"""
 Examples:
@@ -68,10 +89,10 @@ Examples:
         import numpy as np
         import paddle
         import paddle.nn.functional as F
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = F.sigmoid(x)
         print(out.numpy())
         # [0.40131234 0.450166   0.52497919 0.57444252]
@@ -85,10 +106,10 @@ Examples:
         import numpy as np
         import paddle
         import paddle.nn.functional as F
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = F.logsigmoid(x)
         print(out.numpy())
         # [-0.91301525 -0.79813887 -0.64439666 -0.55435524]
@@ -101,10 +122,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.exp(x)
         print(out.numpy())
         # [0.67032005 0.81873075 1.10517092 1.34985881]
@@ -117,10 +138,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.tanh(x)
         print(out.numpy())
         # [-0.37994896 -0.19737532  0.09966799  0.29131261]
@@ -133,10 +154,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.atan(x)
         print(out.numpy())
         # [-0.38050638 -0.19739556  0.09966865  0.29145679]
@@ -147,16 +168,14 @@ add_sample_code(globals()["tanh_shrink"], r"""
 Examples:
     .. code-block:: python
 
-        import numpy as np
         import paddle
         import paddle.nn.functional as F
-        paddle.enable_imperative()
+        import numpy as np
 
-        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
-        out = F.tanh_shrink(x)
-        print(out.numpy())
-        # [-0.02005104 -0.00262468  0.00033201  0.00868739]
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
 
 """)
 
@@ -166,10 +185,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([0.1, 0.2, 0.3, 0.4])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.sqrt(x)
         print(out.numpy())
         # [0.31622777 0.4472136  0.54772256 0.63245553]
@@ -182,10 +201,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([0.1, 0.2, 0.3, 0.4])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.rsqrt(x)
         print(out.numpy())
         # [3.16227766 2.23606798 1.82574186 1.58113883]
@@ -198,10 +217,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.abs(x)
         print(out.numpy())
         # [0.4 0.2 0.1 0.3]
@@ -214,10 +233,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.ceil(x)
         print(out.numpy())
         # [-0. -0.  1.  1.]
@@ -230,10 +249,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.floor(x)
         print(out.numpy())
         # [-1. -1.  0.  0.]
@@ -246,10 +265,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.cos(x)
         print(out.numpy())
         # [0.92106099 0.98006658 0.99500417 0.95533649]
@@ -262,10 +281,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.acos(x)
         print(out.numpy())
         # [1.98231317 1.77215425 1.47062891 1.26610367]
@@ -278,10 +297,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.sin(x)
         print(out.numpy())
         # [-0.38941834 -0.19866933  0.09983342  0.29552021]
@@ -294,10 +313,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.asin(x)
         print(out.numpy())
         # [-0.41151685 -0.20135792  0.10016742  0.30469265]
@@ -310,10 +329,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.cosh(x)
         print(out.numpy())
         # [1.08107237 1.02006676 1.00500417 1.04533851]
@@ -326,10 +345,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.sinh(x)
         print(out.numpy())
         # [-0.41075233 -0.201336    0.10016675  0.30452029]
@@ -342,10 +361,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.5, -0.2, 0.6, 1.5])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.round(x)
         print(out.numpy())
         # [-1. -0.  1.  2.]
@@ -358,10 +377,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.reciprocal(x)
         print(out.numpy())
         # [-2.5        -5.         10.          3.33333333]
@@ -374,10 +393,10 @@ Examples:
 
         import numpy as np
         import paddle
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
+        x = paddle.to_variable(x_data)
         out = paddle.square(x)
         print(out.numpy())
         # [0.16 0.04 0.01 0.09]
@@ -388,16 +407,14 @@ add_sample_code(globals()["softplus"], r"""
 Examples:
     .. code-block:: python
 
-        import numpy as np
         import paddle
         import paddle.nn.functional as F
-        paddle.enable_imperative()
+        import numpy as np
 
-        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
-        out = F.softplus(x)
-        print(out.numpy())
-        # [0.51301525 0.59813887 0.74439666 0.85435524]
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355]
 
 """)
 
@@ -405,16 +422,14 @@ add_sample_code(globals()["softsign"], r"""
 Examples:
     .. code-block:: python
 
-        import numpy as np
         import paddle
         import paddle.nn.functional as F
-        paddle.enable_imperative()
+        import numpy as np
 
-        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
-        x = paddle.imperative.to_variable(x_data)
-        out = F.softsign(x)
-        print(out.numpy())
-        # [-0.28571429 -0.16666667  0.09090909  0.23076923]
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
 
 """)
 
@@ -473,6 +488,7 @@ __all__ += ['hard_shrink']
 _hard_shrink_ = generate_layer_fn('hard_shrink')
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.hardshrink")
 def hard_shrink(x, threshold=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                              'hard_shrink')
@@ -486,10 +502,6 @@ def hard_shrink(x, threshold=None):
 
 
 hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
-	:alias_main: paddle.nn.functional.hard_shrink
-	:alias: paddle.nn.functional.hard_shrink,paddle.nn.functional.activation.hard_shrink
-	:old_api: paddle.fluid.layers.hard_shrink
-
 Examples:
 
     >>> import paddle.fluid as fluid
@@ -502,6 +514,10 @@ __all__ += ['cumsum']
 _cum_sum_ = generate_layer_fn('cumsum')
 
 
+@deprecated(
+    since="2.0.0",
+    update_to="paddle.cumsum",
+    reason="New APIs for Paddle 2.0 are coming.")
 def cumsum(x, axis=None, exclusive=None, reverse=None):
     check_type(x, 'x', (Variable), 'cumsum')
     locals_var = locals().copy()
@@ -631,6 +647,7 @@ __all__ += ['gelu']
 _gelu_ = generate_layer_fn('gelu')
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.gelu")
 def gelu(x, approximate=False):
     locals_var = locals().copy()
     kwargs = dict()
@@ -641,10 +658,6 @@ def gelu(x, approximate=False):
 
 
 gelu.__doc__ = """
-	:alias_main: paddle.nn.functional.gelu
-	:alias: paddle.nn.functional.gelu,paddle.nn.functional.activation.gelu
-	:old_api: paddle.fluid.layers.gelu
-
 :strong:`GeLU Activation Operator`
 For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415).
 
@@ -719,7 +732,7 @@ __all__ += ['erf']
 _erf_ = generate_layer_fn('erf')
 
 
-def erf(x):
+def erf(x, name=None):
     locals_var = locals().copy()
     kwargs = dict()
     for name, val in locals_var.items():
@@ -729,10 +742,6 @@ def erf(x):
 
 
 erf.__doc__ = """
-	:alias_main: paddle.erf
-	:alias: paddle.erf,paddle.tensor.erf,paddle.tensor.math.erf,paddle.nn.functional.erf,paddle.nn.functional.activation.erf
-	:old_api: paddle.fluid.layers.erf
-
 :strong:`Erf Operator`
 For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function).
 
@@ -742,57 +751,22 @@ Equation:
 
 Args:
 
-    x(Variable): The input of Erf op, Tensor or LoDTensor, dtype: float32 or float64.
+    x (Tensor): The input tensor, it's data type should be float32, float64.
 
 Returns:
 
-    Variable: The output of Erf op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input.
+    Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input.
 
 Examples:
     
     .. code-block:: python
     
-        # declarative mode
-        import numpy as np
-        from paddle import fluid
-        
-        x = fluid.data(name="x", shape=(-1, 3), dtype="float32")
-        y = fluid.layers.erf(x)
-        
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        start = fluid.default_startup_program()
-        main = fluid.default_main_program()
-        
-        data = np.random.randn(2, 3).astype("float32")
-        exe.run(start)
-        
-        y_np, = exe.run(main, feed={"x": data}, fetch_list=[y])
-        
-        data
-        # array([[ 0.4643714 , -1.1509596 ,  1.2538221 ],
-        #        [ 0.34369683,  0.27478245,  1.1805398 ]], dtype=float32)
-        y_np
-        # array([[ 0.48863927, -0.8964121 ,  0.9237998 ],
-        #        [ 0.37307587,  0.30242872,  0.9049887 ]], dtype=float32)
-
-    .. code-block:: python
-    
-        # imperative mode
         import numpy as np
-        from paddle import fluid
-        import paddle.fluid.dygraph as dg
-        
-        data = np.random.randn(2, 3).astype("float32")
-        place = fluid.CPUPlace()
-        with dg.guard(place) as g:
-            x = dg.to_variable(data)
-            y = fluid.layers.erf(x)
-            y_np = y.numpy()
-        data
-        # array([[ 0.4643714 , -1.1509596 ,  1.2538221 ],
-        #        [ 0.34369683,  0.27478245,  1.1805398 ]], dtype=float32)
-        y_np
-        # array([[ 0.48863927, -0.8964121 ,  0.9237998 ],
-        #        [ 0.37307587,  0.30242872,  0.9049887 ]], dtype=float32)
+        import paddle
+        paddle.disable_static()
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_tensor(x_data)
+        out = paddle.erf(x)
+        print(out.numpy())
+        # [-0.42839236 -0.22270259  0.11246292  0.32862676]
 """
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index ecc58768522831b55f620cb6dc911630e2c2ad68..fe8ed83923e88be2a0c98a8a539f26500b43b7cb 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -38,6 +38,7 @@ __all__ = [
     'Decoder',
     'BeamSearchDecoder',
     'rnn',
+    'birnn',
     'dynamic_decode',
     'DecodeHelper',
     'TrainingHelper',
@@ -127,7 +128,8 @@ class RNNCell(object):
         else:
             integer_types = (int, )
         check_variable_and_dtype(batch_ref, 'batch_ref',
-                                 ['float32', 'float64'], 'RNNCell')
+                                 ['float32', 'float64', 'int32', 'int64'],
+                                 'RNNCell')
         check_type(shape, 'shape', (list, tuple, type(None), integer_types),
                    'RNNCell')
         if isinstance(shape, (list, tuple)):
@@ -437,61 +439,146 @@ def rnn(cell,
         is_reverse=False,
         **kwargs):
     """
-	:api_attr: Static Graph
-
     rnn creates a recurrent neural network specified by RNNCell `cell`,
-    which performs :code:`cell.call()` repeatedly until reaches to the maximum
-    length of `inputs`.
-
-    Parameters:
-        cell(RNNCell): An instance of `RNNCell`.
-        inputs(Variable): A (possibly nested structure of) tensor variable[s]. 
-            The shape of tensor should be `[batch_size, sequence_length, ...]`
-            for `time_major == False` or `[sequence_length, batch_size, ...]`
-            for `time_major == True`. It represents the inputs to be unrolled
-            in RNN.
-        initial_states(Variable, optional): A (possibly nested structure of)
-            tensor variable[s], representing the initial state for RNN. 
-            If not provided, `cell.get_initial_states` would be used to produce
-            the initial state. Default None.
-        sequence_length(Variable, optional): A tensor with shape `[batch_size]`.
-            It stores real length of each instance, thus enables users to extract
-            the last valid state when past a batch element's sequence length for
-            correctness. If not provided, the paddings would be treated same as
-            non-padding inputs. Default None.
-        time_major(bool, optional): Indicate the data layout of Tensor included
-            in `input` and `output` tensors. If `False`, the data layout would
-            be batch major with shape `[batch_size, sequence_length, ...]`.  If
-            `True`, the data layout would be time major with shape
-            `[sequence_length, batch_size, ...]`. Default: `False`.
-        is_reverse(bool, optional): Indicate whether to calculate in the reverse
-            order of input sequences. Default: `False`.
-        **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. 
+    which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) 
+    repeatedly until reaches to the maximum length of `inputs`.
+
+    Arguments:
+        cell(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences. 
+            If time_major is True, the shape is 
+            `[time_steps, batch_size, input_size]`
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(Tensor|tuple|list, optional): the initial state of the 
+            rnn cell. Tensor or a possibly nested structure of tensors. If not 
+            provided, `cell.get_initial_states` would be called to produce
+            the initial state. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of the cell. 
 
     Returns:
-        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
-            outputs and states, both are Tensor or nested structure of Tensor. \
-            `final_outputs` has the same structure and data types as \
-            the returned `outputs` of :code:`cell.call` , and each Tenser in `final_outputs` \
-            stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \
-            for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \
-            `final_states` is the counterpart at last time step of initial states, \
-            thus has the same structure with it and has tensors with same shapes \
-            and data types.
+        (outputs, final_states)
+        outputs (Tensor|list|tuple): the output sequence. Tensor or nested 
+            structure of Tensors.
+            If `time_major` is True, the shape of each tensor in outpus is 
+            `[time_steps, batch_size, hidden_size]`, else 
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
+            tensor[s], representing the final state for RNN. It has the same 
+            structure of intial state. Each tensor in final states has the same
+            shape and dtype as the corresponding tensor in initial states.
             
 
     Examples:
 
         .. code-block:: python
-            
-            import paddle.fluid as fluid
 
-            inputs = fluid.data(name="inputs",
-                                shape=[-1, 32, 128],
-                                dtype="float32")
-            cell = fluid.layers.GRUCell(hidden_size=128)
-            outputs = fluid.layers.rnn(cell=cell, inputs=inputs)
+            import paddle
+            paddle.disable_static()
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+            outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h) 
+
     """
+    if in_dygraph_mode():
+        return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length,
+                                  time_major, is_reverse, **kwargs)
+    else:
+        return _rnn_static_graph(cell, inputs, initial_states, sequence_length,
+                                 time_major, is_reverse, **kwargs)
+
+
+class ArrayWrapper(object):
+    def __init__(self, x):
+        self.array = [x]
+
+    def append(self, x):
+        self.array.append(x)
+        return self
+
+
+def _maybe_copy(state, new_state, step_mask):
+    """update rnn state or just pass the old state through"""
+    new_state = nn.elementwise_mul(new_state, step_mask, axis=0) \
+              + nn.elementwise_mul(state, (1 - step_mask), axis=0)
+    return new_state
+
+
+def _transpose_batch_time(x):
+    perm = [1, 0] + list(range(2, len(x.shape)))
+    return nn.transpose(x, perm)
+
+
+def _rnn_dynamic_graph(cell,
+                       inputs,
+                       initial_states=None,
+                       sequence_length=None,
+                       time_major=False,
+                       is_reverse=False,
+                       **kwargs):
+    time_step_index = 0 if time_major else 1
+    flat_inputs = flatten(inputs)
+    time_steps = flat_inputs[0].shape[time_step_index]
+
+    if not time_major:
+        inputs = map_structure(_transpose_batch_time, inputs)
+
+    if sequence_length is not None:
+        mask = sequence_lod.sequence_mask(
+            sequence_length, maxlen=time_steps, dtype=inputs.dtype)
+        mask = nn.transpose(mask, [1, 0])
+
+    if is_reverse:
+        inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs)
+        mask = tensor.reverse(mask, axis=[0]) \
+            if sequence_length is not None else None
+
+    states = initial_states
+    outputs = []
+    for i in range(time_steps):
+        step_inputs = map_structure(lambda x: x[i], inputs)
+        step_outputs, new_states = cell(step_inputs, states, **kwargs)
+        if sequence_length is not None:
+            new_states = map_structure(
+                partial(
+                    _maybe_copy, step_mask=mask[i]), states, new_states)
+        states = new_states
+        outputs = map_structure(lambda x: ArrayWrapper(x),
+                                step_outputs) if i == 0 else map_structure(
+                                    lambda x, x_array: x_array.append(x),
+                                    step_outputs, outputs)
+
+    final_outputs = map_structure(
+        lambda x: nn.stack(x.array, axis=time_step_index),
+        outputs)
+
+    if is_reverse:
+        final_outputs = map_structure(
+            lambda x: tensor.reverse(x, axis=time_step_index),
+            final_outputs)
+
+    final_states = new_states
+    return final_outputs, final_states
+
+
+def _rnn_static_graph(cell,
+                      inputs,
+                      initial_states=None,
+                      sequence_length=None,
+                      time_major=False,
+                      is_reverse=False,
+                      **kwargs):
     check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
     if isinstance(inputs, (list, tuple)):
         for i, input_x in enumerate(inputs):
@@ -499,30 +586,10 @@ def rnn(cell,
                                      ['float32', 'float64'], 'rnn')
     check_type(initial_states, 'initial_states',
                (Variable, list, tuple, type(None)), 'rnn')
-    if isinstance(initial_states, (list, tuple)):
-        states = map_structure(lambda x: x, initial_states)[0]
-        for i, state in enumerate(states):
-            if isinstance(state, (list, tuple)):
-                for j, state_j in enumerate(state):
-                    check_variable_and_dtype(state_j, 'state_j[' + str(j) + ']',
-                                             ['float32', 'float64'], 'rnn')
-            else:
-                check_variable_and_dtype(state, 'states[' + str(i) + ']',
-                                         ['float32', 'float64'], 'rnn')
 
     check_type(sequence_length, 'sequence_length', (Variable, type(None)),
                'rnn')
 
-    def _maybe_copy(state, new_state, step_mask):
-        # TODO: use where_op
-        new_state = nn.elementwise_mul(
-            new_state, step_mask, axis=0) - nn.elementwise_mul(
-                state, (step_mask - 1), axis=0)
-        return new_state
-
-    def _transpose_batch_time(x):
-        return nn.transpose(x, [1, 0] + list(range(2, len(x.shape))))
-
     def _switch_grad(x, stop=False):
         x.stop_gradient = stop
         return x
@@ -581,6 +648,98 @@ def rnn(cell,
     return (final_outputs, final_states)
 
 
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states=None,
+          sequence_length=None,
+          time_major=False,
+          **kwargs):
+    """
+    birnn creates a bidirectional recurrent neural network specified by 
+    RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` 
+    (for dygraph mode :code:`cell.forward`) repeatedly until reaches to 
+    the maximum length of `inputs` and then concat the ouputs for both RNNs
+    along the last axis.
+
+    Arguments:
+        cell_fw(RNNCellBase): An instance of `RNNCellBase`.
+        cell_bw(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences. 
+            If time_major is True, the shape is 
+            `[time_steps, batch_size, input_size]`
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(tuple, optional): A tuple of initial states of 
+            `cell_fw` and `cell_bw`.
+            If not provided, `cell.get_initial_states` would be called to 
+            produce initial state for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of each cell. 
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the outputs of the bidirectional RNN. It is the 
+            concatenation of the outputs from the forward RNN and backward 
+            RNN along the last axis. 
+            If time major is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward 
+            cell and backward cell.        
+
+    Examples:
+
+        .. code-block:: python
+            
+            import paddle
+            paddle.disable_static()
+
+            cell_fw = paddle.nn.LSTMCell(16, 32)
+            cell_bw = paddle.nn.LSTMCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
+            hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
+            initial_states = ((hf, cf), (hb, cb))
+            outputs, final_states = paddle.nn.functional.birnn(
+                cell_fw, cell_bw, inputs, initial_states)
+        
+    """
+    if initial_states is None:
+        states_fw = cell_fw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
+        states_bw = cell_fw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
+    else:
+        states_fw, states_bw = initial_states
+    outputs_fw, states_fw = rnn(cell_fw,
+                                inputs,
+                                states_fw,
+                                sequence_length,
+                                time_major=time_major,
+                                **kwargs)
+
+    outputs_bw, states_bw = rnn(cell_bw,
+                                inputs,
+                                states_bw,
+                                sequence_length,
+                                time_major=time_major,
+                                is_reverse=True,
+                                **kwargs)
+
+    outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw,
+                            outputs_bw)
+
+    final_states = (states_fw, states_bw)
+    return outputs, final_states
+
+
 class Decoder(object):
     """
 	:api_attr: Static Graph
@@ -2212,9 +2371,9 @@ def lstm(input,
         input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64
         init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                        If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
+        max_len (int): This parameter has no effect and will be discarded.
         init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                        If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len.
         hidden_size (int): hidden size of the LSTM.
         num_layers (int): total layers number of the LSTM.
         dropout_prob(float, optional): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -2255,7 +2414,6 @@ def lstm(input,
             data = fluid.data(name='x', shape=[None, 100], dtype='int64')
             emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
             batch_size = 20
-            max_len = 100
             dropout_prob = 0.2
             input_size = 100
             hidden_size = 150
@@ -2308,9 +2466,11 @@ def lstm(input,
     out = helper.create_variable_for_type_inference(dtype)
     last_h = helper.create_variable_for_type_inference(dtype)
     last_c = helper.create_variable_for_type_inference(dtype)
-
-    cache = helper.create_variable(
-        persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
+    reserve = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+    state_out = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+    state_out.persistable = True
 
     helper.append_op(
         type='cudnn_lstm',
@@ -2319,15 +2479,15 @@ def lstm(input,
             'InitH': init_h,
             'InitC': init_c,
             'W': weight,
-            'Cache': cache,
         },
         outputs={
             'Out': out,
-            'last_h': last_h,
-            'last_c': last_c,
+            'LastH': last_h,
+            'LastC': last_c,
+            'Reserve': reserve,
+            'StateOut': state_out,
         },
         attrs={
-            'max_len': max_len,
             'is_bidirec': is_bidirec,
             'input_size': input_size,
             'hidden_size': hidden_size,
@@ -3101,7 +3261,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
                              'beam_search_encode')
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
-    sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
+    sentence_scores = helper.create_variable_for_type_inference(
+        dtype=scores.dtype)
 
     helper.append_op(
         type="beam_search_decode",
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 2d874b4806c9e1449a170017440c4b5038ff93bf..77a78eb4a14a0a5ad9be9cff71131ca473106ab8 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -26,6 +26,7 @@ from .. import core
 from .layer_function_generator import templatedoc
 from . import utils
 from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from paddle.utils import deprecated
 import numpy
 import warnings
 
@@ -317,7 +318,7 @@ def concat(input, axis=0, name=None):
     if in_dygraph_mode():
         if isinstance(axis, Variable):
             axis = axis.numpy()
-            axis = axis[0]
+            axis = axis.item(0)
         return core.ops.concat(input, 'axis', axis)
 
     check_type(input, 'input', (list, tuple, Variable), 'concat')
@@ -642,7 +643,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
         shape(list|tuple|Tensor): Shape of the output Tensor, the data type of ``shape`` is int32 or int64.
             If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
             If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Tensor which can
+        dtype(np.dtype|str): Data type of the output Tensor which can
             be float16, float32, float64, int32, int64.
         value(bool|float|int|Tensor): The constant value used to initialize 
             the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor.
@@ -699,9 +700,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
 
         if isinstance(value, Variable):
             if dtype in ['int64', 'int32']:
-                attrs['str_value'] = str(int(value.numpy()))
+                attrs['str_value'] = str(int(value.numpy().item(0)))
             else:
-                attrs['str_value'] = str(float(value.numpy()))
+                attrs['str_value'] = str(float(value.numpy().item(0)))
 
         core.ops.fill_constant(out, 'value',
                                float(value), 'force_cpu', force_cpu, 'dtype',
@@ -746,6 +747,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     return out
 
 
+@deprecated(since='1.8.0', update_to="paddle.fill_constant")
 @templatedoc()
 def fill_constant_batch_size_like(input,
                                   shape,
@@ -1040,7 +1042,7 @@ def ones(shape, dtype, force_cpu=False):
 
     Parameters:
         shape(tuple|list|Tensor): Shape of output Tensor, the data type of shape is int32 or int64.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports
+        dtype (np.dtype|str): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64.
         force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory.
             If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory.
@@ -1073,7 +1075,7 @@ def zeros(shape, dtype, force_cpu=False, name=None):
 
     Parameters:
         shape(tuple|list|Tensor): Shape of output Tensor, the data type of ``shape`` is int32 or int64.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports
+        dtype (np.dtype|str): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64.
         force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory.
             If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory.
@@ -1435,14 +1437,14 @@ def linspace(start, stop, num, dtype=None, name=None):
     This OP return fixed number of evenly spaced values within a given interval.
 
     Args:
-        start(float|Tensor): The input :attr:`start` is start variable of range. It is a float scalar, \
-            or a Tensor of shape [1] with input data type float32, float64.
-        stop(float|Tensor): The input :attr:`stop` is start variable of range. It is a float scalar, \
-            or a Tensor of shape [1] with input data type float32, float64.
+        start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \
+            or a Tensor of shape [1] with input data type int32, int64, float32 or float64.
+        stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \
+            or a Tensor of shape [1] with input data type int32, int64, float32 or float64.
         num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \
-            or a Tensor of shape [1] with data type int32.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output tensor, it could be 'float32' and 'float64'.
-            Default: if None, the data type is float32.
+            or a Tensor of shape [1] with data type int32 or int64.
+        dtype(np.dtype|str, optional): The data type of output tensor, it could be
+            int32, int64, float32 and float64. Default: if None, the data type is float32.
         name(str, optional): Normally there is no need for user to set this property. 
             For more information, please refer to :ref:`api_guide_Name`.Default: None.
 
@@ -1452,9 +1454,11 @@ def linspace(start, stop, num, dtype=None, name=None):
         the value with input :attr:`start`. 
 
     Raises:
-        TypeError: The ``dtype`` must be one of float32 and float64.
-        TypeError: The data type of ``start`` and ``stop``  must be one of float32 and float64.
-        TypeError: The data type of ``num`` must be one of int32 and int64.
+        TypeError: The ``dtype`` must be one of int32, int64, float32 and float64.
+        TypeError: The type of ``num`` must be int When it's not a Tensor.
+        TypeError: The data type of ``num`` must be int32  When it's  a Tensor.
+        TypeError: The data type of ``start`` and  ``stop`` must be same as ``dtype`` When it's  a Tensor.
+
 
 
     Examples:
@@ -1467,29 +1471,47 @@ def linspace(start, stop, num, dtype=None, name=None):
     """
     if dtype is None:
         dtype = 'float32'
+    tensor_num = num
+    tensor_start = start
+    tensor_stop = stop
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
     if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
+        tensor_start = fill_constant([1], dtype, start)
     if not isinstance(stop, Variable):
-        stop = fill_constant([1], dtype, stop)
+        tensor_stop = fill_constant([1], dtype, stop)
     if not isinstance(num, Variable):
-        num = fill_constant([1], 'int32', num)
+        tensor_num = fill_constant([1], 'int32', num)
     if in_dygraph_mode():
-        return core.ops.linspace(start, stop, num)
+        return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype',
+                                 dtype)
 
     helper = LayerHelper("linspace", **locals())
 
-    check_dtype(start.dtype, 'start', ['float32', 'float64'], 'linspace')
-    check_dtype(stop.dtype, 'stop', ['float32', 'float64'], 'linspace')
-    check_dtype(num.dtype, 'num', ['int32', 'int64'], 'linspace')
-    check_dtype(dtype, 'dtype', ['float32', 'float64'], 'linspace')
+    if isinstance(start, Variable):
+        check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace')
+    else:
+        check_type(start, 'start', (int, float), 'linspace')
 
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+    if isinstance(stop, Variable):
+        check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace')
+    else:
+        check_type(stop, 'stop', (int, float), 'linspace')
+    if isinstance(num, Variable):
+        check_dtype(num.dtype, 'num', ['int32'], 'linspace')
+    else:
+        check_type(num, 'num', (int), 'linspace')
+    check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
+                'linspace')
+
+    out = helper.create_variable_for_type_inference(dtype=dtype)
 
     helper.append_op(
         type='linspace',
-        inputs={'Start': start,
-                'Stop': stop,
-                'Num': num},
+        inputs={'Start': tensor_start,
+                'Stop': tensor_stop,
+                'Num': tensor_num},
+        attrs={'dtype': dtype},
         outputs={'Out': [out]})
     return out
 
@@ -1537,6 +1559,7 @@ def zeros_like(x, out=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.diag")
 def diag(diagonal):
     """
 	:alias_main: paddle.diag
@@ -1598,7 +1621,7 @@ def eye(num_rows,
             If None, default: num_rows.
         batch_shape(list, optional): If provided, the returned tensor will have a leading
             batch size of this shape, the data type of ``batch_shape`` is int. Default is None.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned tensor.
+        dtype(np.dtype|str, optional): The data type of the returned tensor.
             It should be int32, int64, float16, float32, float64, default is 'float32'.
         name(str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c84d2ac3796efe9d16641552f1be939a666aa4cf..8f34576b836a5412a6792a6dfd63b3c9fd8de560 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -40,6 +40,7 @@ from paddle.fluid.layers import tensor
 from functools import reduce
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+import paddle
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -60,21 +61,23 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def __init__(self,
                  learning_rate,
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
                  name=None):
+        # Because of the loop import, so place it in the function body
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         self._parameter_list = list(
             parameter_list) if parameter_list is not None else None
         self._name = name
         if framework.in_dygraph_mode():
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, LearningRateDecay):
+            if not isinstance(learning_rate,
+                              (float, LearningRateDecay, _LRScheduler)):
                 raise TypeError(
-                    "learning rate should be float or LearningRateDecay, got %s here"
+                    "learning rate should be float or _LRScheduler, got %s here"
                     % type(learning_rate))
             if self._parameter_list is None:
                 raise AttributeError(
@@ -89,11 +92,11 @@ class Optimizer(object):
                             % regularization.__str__())
                         break
         else:
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, framework.Variable):
+            if not isinstance(learning_rate,
+                              (float, framework.Variable, _LRScheduler)):
                 raise TypeError(
-                    "learning rate should be float or Variable, got %s here" %
-                    type(learning_rate))
+                    "learning rate should be float or _LRScheduler, got %s here"
+                    % type(learning_rate))
 
         if grad_clip is not None:
             if not isinstance(grad_clip, GradientClipBase):
@@ -143,11 +146,15 @@ class Optimizer(object):
                     state_dict = adam.state_dict()
 
         '''
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         state_dict = {}
         for k, v in self._accumulators.items():
             for para_name, var_tmp in v.items():
                 state_dict[var_tmp.name] = var_tmp
         # global step if use lr decay
+        if isinstance(self._learning_rate, _LRScheduler):
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+            return state_dict
         if isinstance(self._learning_rate, LearningRateDecay):
             state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
 
@@ -191,6 +198,9 @@ class Optimizer(object):
                     adam.set_dict(opti_state_dict)
 
         '''
+        from paddle.optimizer.lr_scheduler import _LRScheduler
+        if isinstance(self._learning_rate, _LRScheduler):
+            self._learning_rate.set_dict(state_dict["LR_Scheduler"])
 
         if isinstance(self._learning_rate, LearningRateDecay):
             self._learning_rate.set_dict(state_dict["LR_Scheduler"])
@@ -251,6 +261,30 @@ class Optimizer(object):
         return self._opti_name_list
 
     def _create_global_learning_rate(self):
+        from paddle.optimizer.lr_scheduler import _LRScheduler
+        if isinstance(self._learning_rate, _LRScheduler):
+            lr_var = self._global_learning_rate()
+            # only create global lr_var once
+            if not isinstance(lr_var, framework.Variable):
+                lr_name = unique_name.generate('learning_rate')
+                self._learning_rate._var_name = lr_name
+                lr_var = self.helper.create_global_variable(
+                    name=lr_name,
+                    shape=[1],
+                    persistable=True,
+                    stop_gradient=True,
+                    dtype='float32' if self._dtype is None else self._dtype)
+                main_prog = framework.default_main_program()
+                main_prog.lr_sheduler = self._learning_rate
+                main_prog.lr_var = lr_var
+                self._learning_rate_map[framework.default_main_program(
+                )] = lr_var
+
+            lr_value = float(self._learning_rate())
+            self.helper.set_variable_initializer(
+                lr_var, initializer=Constant(value=lr_value))
+            return
+
         if imperative_base.enabled():
             # create learning rate Variable
             if isinstance(self._learning_rate, float):
@@ -754,7 +788,7 @@ class Optimizer(object):
                 params_grads = append_backward(loss, parameter_list,
                                                act_no_grad_set, callbacks)
                 # Note: since we can't use all_reduce_op now,
-                #  dgc_op should be the last op of one grad.
+                # dgc_op should be the last op of one grad.
                 self._append_dgc_ops(params_grads)
         return params_grads
 
@@ -863,7 +897,7 @@ class Optimizer(object):
             if p.trainable:
                 p.clear_gradient()
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -981,7 +1015,7 @@ class SGDOptimizer(Optimizer):
             name=name)
         self.type = "sgd"
 
-    @no_grad
+    @no_grad()
     def _append_optimize_op(self, block, param_and_grad):
         lr = self._create_param_lr(param_and_grad)
         if framework.in_dygraph_mode():
@@ -1141,7 +1175,7 @@ class MomentumOptimizer(Optimizer):
 
 class DGCMomentumOptimizer(Optimizer):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887
 
@@ -1518,7 +1552,7 @@ class DGCMomentumOptimizer(Optimizer):
         dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
                          [param_var.name, grad_var.name])
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def apply_gradients(self, params_grads):
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
         params_grads, table_param_and_grad, table_optimize_op = \
@@ -3067,7 +3101,7 @@ Lamb = LambOptimizer
 
 class ModelAverage(Optimizer):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     The ModelAverage optimizer accumulates specific continuous historical parameters
     during training. The accumulated historical range can be controlled by the passed
@@ -3376,7 +3410,7 @@ class ModelAverage(Optimizer):
 
 class ExponentialMovingAverage(object):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Compute the moving average of parameters with exponential decay.
     Given a parameter :math:`\\theta`, its exponential moving average (EMA)
@@ -3626,7 +3660,7 @@ class ExponentialMovingAverage(object):
 
 class PipelineOptimizer(object):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Pipeline Optimizer: Make a program to run as pipeline, that is splitting a
     program into multiple sections (sub-programs) and each section run on a
@@ -3690,7 +3724,8 @@ class PipelineOptimizer(object):
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         if framework.in_dygraph_mode():
             raise Exception("In dygraph, don't support PipelineOptimizer.")
-        if not isinstance(optimizer, Optimizer):
+        if not isinstance(optimizer, Optimizer) and not isinstance(
+                optimizer, paddle.optimizer.Optimizer):
             raise ValueError("The 'optimizer' parameter for "
                              "PipelineOptimizer must be an instance of "
                              "Optimizer, but the given type is {}.".format(
@@ -4477,7 +4512,7 @@ class PipelineOptimizer(object):
 
 class RecomputeOptimizer(Optimizer):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Recompute Optimizer Wrapper
 
@@ -4562,7 +4597,7 @@ class RecomputeOptimizer(Optimizer):
 
     def load(self, stat_dict):
         """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
         load function is not supported by Recompute Optimizer for now.
         :return: None
@@ -4786,7 +4821,7 @@ class RecomputeOptimizer(Optimizer):
 
 class LookaheadOptimizer(object):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     This implements the Lookahead optimizer of the
     paper : https://arxiv.org/abs/1907.08610.
@@ -4929,6 +4964,11 @@ class LookaheadOptimizer(object):
 
             mod = layers.elementwise_mod(step, k)
             with layers.control_flow.Switch() as switch:
+                with switch.case(step == one_var):
+                    for param_name in params:
+                        fast_var = main_block.var(param_name)
+                        slow_var = param_to_slow[param_name]
+                        layers.assign(input=fast_var, output=slow_var)
                 with switch.case(mod == zero_var):
                     for param_name in params:
                         fast_var = main_block.var(param_name)
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index a45443632b04835bc8f3b3f2f167433c7a8b49d4..8e0470bededd4fdb8aec03893590bdba35bbb364 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -204,6 +204,9 @@ class WeightNormParamAttr(ParamAttr):
     """
 	:api_attr: Static Graph
 
+    Note:
+        Please use 'paddle.nn.utils.weight_norm' in dygraph mode.
+
     Parameter of weight Norm. Weight Norm is a reparameterization of the weight vectors
     in a neural network that decouples the magnitude of those weight vectors from
     their direction. Weight Norm has been implemented as discussed in this
@@ -216,6 +219,7 @@ class WeightNormParamAttr(ParamAttr):
         It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. 
         There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , 
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
+        
 
     Args:
         dim(int): Dimension over which to compute the norm. Dim is a non-negative
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 1f96bbc4ceeac142d802ef94a18f5ace96ce820d..76c95be75d67d60cd59efe13ecba6f01a1c1d614 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -22,8 +22,9 @@ from .framework import Program, Variable, program_guard, default_main_program, d
 from .executor import global_scope
 from .data_feeder import DataFeeder, BatchedTensorProvider
 from .multiprocess_utils import multiprocess_queue_set, CleanupFuncRegistrar, _cleanup_mmap, _cleanup, _set_SIGCHLD_handler
-from .dataloader import BatchSampler, Dataset
-from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, default_collate_fn
+from .dataloader import BatchSampler, Dataset, IterableDataset
+from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, _DatasetKind, default_collate_fn
+from .dataloader.batch_sampler import _InfiniteIterableSampler
 from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
 from .unique_name import UniqueNameGenerator
 import logging
@@ -48,6 +49,7 @@ __all__ = ['PyReader', 'DataLoader', 'default_collate_fn']
 data_loader_unique_name_generator = UniqueNameGenerator()
 
 KEEP_DATA_LOADER_ORDER = True
+USE_PINNED_MEMORY = None
 
 
 def keep_data_loader_order(*args):
@@ -59,6 +61,15 @@ def keep_data_loader_order(*args):
         KEEP_DATA_LOADER_ORDER = args[0]
 
 
+def use_pinned_memory(*args):
+    global USE_PINNED_MEMORY
+    if len(args) == 0:
+        return USE_PINNED_MEMORY
+    else:
+        assert len(args) == 1 and isinstance(args[0], bool)
+        USE_PINNED_MEMORY = args[0]
+
+
 def _convert_places(places):
     if not isinstance(places, (list, tuple)):
         places = [places]
@@ -126,8 +137,9 @@ class DataLoader(object):
 
     Args:  
         dataset(Dataset): the dataset to load data from, should be an
-            instance of subclass of :code:`paddle.io.Dataset`.
-        feed_list (list(Variable)|tuple(Variable)): feed variable list.
+            instance of subclass of :code:`paddle.io.Dataset` or
+            :code:`paddle.io.IterableDataset`.
+        feed_list (list(Tensor)|tuple(Tensor)): feed variable list.
             The variables should be created by :code:`fluid.data()`.
             :attr:`feed_list` must be set if :attr:`return_list` is
             False. Default None.
@@ -285,6 +297,10 @@ class DataLoader(object):
 
             # -------------------------------------------------------
 
+    .. note::
+        For reading iterable dataset with multiprocess Dataloader,
+        please see :code:`paddle.io.IterableDataset`
+
     """
 
     def __init__(self,
@@ -338,6 +354,18 @@ class DataLoader(object):
         assert timeout >= 0, "timeout should be a non-negative value"
         self.timeout = timeout
 
+        if isinstance(dataset, IterableDataset):
+            self.dataset_kind = _DatasetKind.ITER
+            if shuffle:
+                raise ValueError(
+                    "IterableDataset not support shuffle, but got shuffle={}".
+                    format(shuffle))
+            if batch_sampler is not None:
+                raise ValueError(
+                    "IterableDataset expect unspecified batch_sampler")
+        else:
+            self.dataset_kind = _DatasetKind.MAP
+
         if batch_sampler is not None:
             assert isinstance(batch_sampler, BatchSampler), \
                 "batch_sampler should be None or subclass instance " \
@@ -350,11 +378,20 @@ class DataLoader(object):
             assert batch_size is not None and batch_size > 0, \
                 "batch_size should be a positive value when " \
                 "batch_sampler is not given"
-            self.batch_sampler = BatchSampler(
-                dataset=dataset,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                drop_last=drop_last)
+            if isinstance(dataset, IterableDataset):
+                self.batch_sampler = _InfiniteIterableSampler(dataset,
+                                                              batch_size)
+            else:
+                self.batch_sampler = BatchSampler(
+                    dataset=dataset,
+                    batch_size=batch_size,
+                    shuffle=shuffle,
+                    drop_last=drop_last)
+
+        self.pin_memory = False
+        if in_dygraph_mode():
+            self.pin_memory = True if use_pinned_memory(
+            ) is None else use_pinned_memory()
 
     def __len__(self):
         return len(self.batch_sampler)
@@ -714,6 +751,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
         # mode, this thread is used to get next batch data from self._batch_reader, then 
         # push it into self._blocking_queue
         self._thread = None
+        self._pin_memory = True if use_pinned_memory(
+        ) is None else use_pinned_memory()
 
     @property
     def queue(self):
@@ -759,7 +798,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
         self._reader = None
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer, True)
+            self._need_check_feed, self._places, self._use_double_buffer, True,
+            self._pin_memory)
 
     def _start(self):
         if self._use_multiprocess:
@@ -999,7 +1039,7 @@ class GeneratorLoader(DataLoaderBase):
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
             self._need_check_feed, self._places, self._use_double_buffer,
-            self._drop_last)
+            self._drop_last, False)
 
     def _init_non_iterable(self):
         lod_levels = []
@@ -1669,7 +1709,7 @@ class PyReader(DataLoaderBase):
 
 class DatasetLoader(DataLoaderBase):
     def __init__(self, dataset, places, drop_last):
-        assert isinstance(dataset, paddle.fleet.dataset.
+        assert isinstance(dataset, paddle.distributed.fleet.dataset.
                           DatasetBase), "dataset must be type of DatasetBase"
         assert not in_dygraph_mode(
         ), "DatasetLoader is not supported in dygraph mode yet"
@@ -1685,7 +1725,7 @@ class DatasetLoader(DataLoaderBase):
 
         dataset.set_thread(thread_num)
 
-        if isinstance(dataset, paddle.fleet.dataset.
+        if isinstance(dataset, paddle.distributed.fleet.dataset.
                       InMemoryDataset) and dataset.queue_num > thread_num:
             logging.warn("queue_num {} which is set in Dataset is ignored".
                          format(dataset.queue_num))
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
old mode 100755
new mode 100644
index 41580659d3687970d52f5bb22d324934961a4a2f..6220bf62c79c30737f923e744d5670818f54ff6e
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -13,6 +13,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
 list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
+list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -22,6 +23,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
 list(APPEND MIXED_DIST_TEST_OPS test_launch)
+list(APPEND MIXED_DIST_TEST_OPS test_c_comm_init_op)
 list(APPEND MIXED_DIST_TEST_OPS test_launch_ps)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
@@ -32,16 +34,19 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
 list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
@@ -51,6 +56,14 @@ if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_allgather)
     LIST(REMOVE_ITEM TEST_OPS test_allreduce)
     LIST(REMOVE_ITEM TEST_OPS test_broadcast)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
 endif()
@@ -88,10 +101,16 @@ endif()
 
 
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint3)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_multiple)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_dist_basic)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs1)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs2)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs3)
 LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
 if(APPLE OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_hdfs)
     LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_metric)
 endif()
@@ -104,6 +123,8 @@ if (NOT ${WITH_GPU})
     list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext)
     LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding)
     LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm)
+    LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
 elseif(${CUDNN_VERSION} VERSION_LESS 7100)
     LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
 endif()
@@ -188,6 +209,7 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+
 function(bash_test_modules TARGET_NAME)
     if(NOT WITH_TESTING)
         return()
@@ -230,6 +252,51 @@ function(bash_test_modules TARGET_NAME)
     endif()
 endfunction()
 
+function(parallel_bash_test_modules TARGET_NAME)
+    if(NOT WITH_TESTING)
+        return()
+    endif()
+
+    set(options SERIAL)
+    set(oneValueArgs TIMEOUT START_BASH)
+    set(multiValueArgs DEPS ENVS LABELS UnitTests)
+    cmake_parse_arguments(parallel_bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+
+    set(timeout 120)
+    if(${parallel_bash_test_modules_TIMEOUT})
+        set(timeout ${parallel_bash_test_modules_TIMEOUT})
+    endif()
+
+    list(JOIN  parallel_bash_test_modules_UnitTests " " uts_string)
+
+    if(WITH_COVERAGE)
+        add_test(NAME ${TARGET_NAME}
+            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
+            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
+            WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    else()
+        add_test(NAME ${TARGET_NAME}
+            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
+            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    endif()
+
+    if (parallel_bash_test_modules_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    endif()
+
+    if(parallel_bash_test_modules_LABELS)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${parallel_bash_test_modules_LABELS})
+    else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout})
+    endif()
+endfunction()
+
+
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler)
@@ -264,6 +331,9 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
 
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
+list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
+
+
 if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
   list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
@@ -276,6 +346,8 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_static)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dynamic)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
+  list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset)
+  list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset)
 endif()
 
 if(NOT WITH_GPU OR WIN32 OR APPLE)
@@ -355,17 +427,21 @@ if(WITH_DISTRIBUTE)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
 
-    # FIXME(seiriosX) will readd after PR 22957  Merged
+
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec")
 
+    # FIXME(seiriosX) will fix this
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
+
     py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
     py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
     py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS})
@@ -375,16 +451,19 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_collective_optimizer MODULES test_collective_optimizer)
     if(NOT APPLE)
     	   py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
+    	   py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS})
+    	   py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
-	   py_test_modules(test_fleet_graph_execution_meta_optimizer MODULES test_fleet_graph_execution_meta_optimizer ENVS ${dist_ENVS})
-	   py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
+	       py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
            py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
            py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
+	   py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
             py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
         endif(NOT WIN32)
     endif(NOT APPLE)
     if(WITH_DGC)
@@ -403,12 +482,14 @@ if(WITH_DISTRIBUTE)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
             bash_test_modules(test_launch START_BASH test_launch.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+            bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
             py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
         endif()
 
         bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
 
+        # port range (20000, 23000) is reserved for dist-ops
         set(dist_ut_port 20001)
         foreach(TEST_OP ${DIST_TEST_OPS})
             bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
@@ -448,13 +529,20 @@ if(NOT WIN32)
 endif()
 
 if(NOT APPLE AND NOT WIN32)
-    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 600)
-    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 600)
-    bash_test_modules(test_checkpoint_saver START_BASH dist_test.sh TIMEOUT 600)
+    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
 endif()
 
 add_subdirectory(sequence)
 add_subdirectory(dygraph_to_static)
+add_subdirectory(rnn)
 
 if (WITH_MKLDNN)
     add_subdirectory(mkldnn)
@@ -492,4 +580,15 @@ if(NOT WIN32 AND NOT APPLE)
     set_tests_properties(test_multiprocess_dataloader_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_dataset PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 endif()
+
+# setting timeout value for old unittests
+# set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200)
+set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
+set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
diff --git a/python/paddle/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py
index b94a21a7e406b833797f8f521c62a2351c2bc30a..193b91cdaa13293ca920a8b79826bb71657c5d56 100644
--- a/python/paddle/fluid/tests/unittests/__init__.py
+++ b/python/paddle/fluid/tests/unittests/__init__.py
@@ -10,4 +10,15 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License.p
+
+# Note: On Windows, import form subdirectories such as dirA()->dirB(), current directory 
+# will still be dirA(), But is should be dirB(). So it will ModulNotFoundError
+# please refer to https://stackoverflow.com/questions/8953844/import-module-from-subfolder
+
+import os
+if os.name == 'nt':
+    import sys
+    dirname, filename = os.path.split(os.path.abspath(__file__))
+    sys.path.insert(0, dirname)
+    print(sys.path)
diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
index 812730e9523f8d24ade68474b858e04b41fc6895..529ff4ec45d1fdc6d1d8e765e38cff53d36aade7 100644
--- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
+++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
@@ -30,11 +30,11 @@ from paddle.fluid import unique_name
 import numpy as np
 from paddle.io import Dataset, BatchSampler, DataLoader
 
-BATCH_NUM = 20
-BATCH_SIZE = 16
+BATCH_NUM = 4
+BATCH_SIZE = 1
 
 #IMAGE_SIZE = 128
-CLASS_NUM = 10
+CLASS_NUM = 2
 
 USE_GPU = False  # whether use GPU to run model
 places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
@@ -59,7 +59,7 @@ def sample_list_generator_creator():
         for _ in range(BATCH_NUM):
             sample_list = []
             for _ in range(BATCH_SIZE):
-                image, label = get_random_images_and_labels([16, 16], [1])
+                image, label = get_random_images_and_labels([4, 4], [1])
                 sample_list.append([image, label])
 
             yield sample_list
@@ -75,8 +75,7 @@ class AutoCheckpointBase(unittest.TestCase):
                   minimize=True,
                   iterable=True):
         def simple_net():
-            image = fluid.data(
-                name='image', shape=[-1, 16, 16], dtype='float32')
+            image = fluid.data(name='image', shape=[-1, 4, 4], dtype='float32')
             label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
 
             fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db77477cca62d10ff6692013a64a8d2ce5a38ec1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
+
+
+class TestCCommInitOp(unittest.TestCase):
+    def setUp(self):
+        self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')
+        self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+        self.nranks = len(self.endpoints)
+        self.rank = self.endpoints.index(self.current_endpoint)
+        self.gpu_id = int(os.getenv("FLAGS_selected_gpus"))
+        self.place = fluid.CUDAPlace(self.gpu_id)
+        self.exe = fluid.Executor(self.place)
+        self.endpoints.remove(self.current_endpoint)
+        self.other_endpoints = self.endpoints
+        if self.rank == 0:
+            wait_server_ready(self.other_endpoints)
+
+    def test_specifying_devices(self):
+        program = fluid.Program()
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=fluid.unique_name.generate('nccl_id'),
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': self.rank,
+                'endpoint': self.current_endpoint,
+                'other_endpoints': self.other_endpoints
+            })
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': self.nranks,
+                'rank': self.rank,
+                'ring_id': 0,
+                'device_id': self.gpu_id
+            })
+        self.exe.run(program)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf4ca07ae9b57e083137945f58aaabb571e20ec
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tensor_list = []
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_gather(tensor_list, tindata)
+            return tensor_list
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllgatherAPI, "allgather")
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..aea429ae5e3e622ee1b584796ef87edc1d4c8d72
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_reduce(tindata)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllreduceAPI, "allreduce")
diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b3c27126d926ac7175f6045f385adf4d530b44
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            paddle.distributed.barrier()
+            return []
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveBarrierAPI, "barrier")
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a879a027b50688234c8efb8468e6eac660d8a145
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.broadcast(tindata, src=1)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveBroadcastAPI, "broadcast")
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e89b1cb3ee8550d3dbb4e1a055f092e57126c7f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.reduce(tindata, dst=0)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduceAPI, "reduce")
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..da61284344b58d44c5ba02af5ed42c553f857c94
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveReduce(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_reduce_sum",
+                inputs={'X': tindata},
+                attrs={'ring_id': ring_id,
+                       'root_id': rootid},
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduce, "reduce", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e6904286234364e7ae84a5c21b9826885f99dc4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveReduce(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_reduce_sum",
+                inputs={'X': tindata},
+                attrs={
+                    'ring_id': ring_id,
+                    'use_calc_stream': True,
+                    'root_id': rootid
+                },
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduce, "reduce", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68929ad3b36d5a0bf145a93b30172f0422dc9f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata",
+                shape=[10, 1000],
+                dtype='float64',
+                append_batch_size=False)
+            toutdata = layers.fill_constant(
+                shape=[5, 1000], dtype='float64', value=1.0)
+            tensor_list = None
+            if rank == 1:
+                tensor_list = paddle.split(tindata, 2, axis=0)
+            paddle.distributed.scatter(toutdata, tensor_list, src=1)
+            return [toutdata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveScatterAPI, "scatter")
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..efe5e17bcce1ecddf859edbb3543876fe5fc9f89
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveScatter(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_scatter",
+                inputs={'X': tindata},
+                attrs={'ring_id': ring_id,
+                       'root': rootid,
+                       'nranks': 2},
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveScatter, "scatter", 0)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 033bc3850052199ca8da6d4588851de9c9903193..73b546b95cfeb8032c6e99eabe24c883d1f5f66c 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -28,7 +28,7 @@ import numpy as np
 
 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-from paddle.fleet.base.util_factory import fleet_util
+from paddle.distributed.fleet.base.util_factory import fleet_util
 
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
@@ -162,30 +162,24 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
         fleet.init_worker()
-        exe.run(fleet.startup_program)
-
+        exe.run(fluid.default_startup_program())
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
 
-        compiled_prog = fluid.compiler.CompiledProgram(
-            fleet.main_program).with_data_parallel(
-                loss_name=self.avg_cost.name,
-                build_strategy=self.strategy.get_build_strategy(),
-                exec_strategy=self.strategy.get_execute_strategy())
-
         for epoch_id in range(1):
             self.reader.start()
             try:
                 pass_start = time.time()
                 while True:
-                    loss_val = exe.run(program=compiled_prog,
+                    loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    reduce_output = fleet_util.all_reduce(
-                        np.array(loss_val), mode="sum")
-                    loss_all_trainer = fleet_util.all_gather(float(loss_val))
-                    loss_val = float(reduce_output) / len(loss_all_trainer)
+                    # TODO(randomly fail)
+                    #   reduce_output = fleet_util.all_reduce(
+                    #       np.array(loss_val), mode="sum")
+                    #   loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    #   loss_val = float(reduce_output) / len(loss_all_trainer)
                     message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                       loss_val)
                     fleet_util.print_on_rank(message, 0)
@@ -208,7 +202,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         exe = fluid.Executor(fluid.CPUPlace())
 
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         thread_num = 2
         batch_size = 128
@@ -217,7 +211,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             filelist.append(train_file_path)
 
         # config dataset
-        dataset = paddle.fleet.DatasetFactory().create_dataset()
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
         dataset.set_batch_size(batch_size)
         dataset.set_use_var(self.feeds)
         pipe_command = 'python ctr_dataset_reader.py'
@@ -230,7 +224,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             pass_start = time.time()
             dataset.set_filelist(filelist)
             exe.train_from_dataset(
-                program=fleet.main_program,
+                program=fluid.default_main_program(),
                 dataset=dataset,
                 fetch_list=[self.avg_cost],
                 fetch_info=["cost"],
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d0fa447daf3e3a502e7d77491045f92695496c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -0,0 +1,152 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Distribute CTR model for test fleet api
+"""
+
+from __future__ import print_function
+
+import shutil
+import tempfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import os
+import numpy as np
+
+import ctr_dataset_reader
+from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
+from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
+from paddle.distributed.fleet.base.util_factory import fleet_util
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistGpuPsCTR2x2(TestDistCTR2x2):
+    """
+    For test CTR model, using Fleet api & PS-GPU
+    """
+
+    def check_model_right(self, dirname):
+        model_filename = os.path.join(dirname, "__model__")
+
+        with open(model_filename, "rb") as f:
+            program_desc_str = f.read()
+
+        program = fluid.Program.parse_from_string(program_desc_str)
+        with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
+            wn.write(str(program))
+
+    def do_pyreader_training(self, fleet):
+        """
+        do training using dataset, using fetch handler to catch variable
+        Args:
+            fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
+        """
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+        exe = fluid.Executor(place)
+        fleet.init_worker()
+        exe.run(fleet.startup_program)
+
+        batch_size = 4
+        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
+        self.reader.decorate_sample_list_generator(train_reader)
+
+        for epoch_id in range(1):
+            self.reader.start()
+            try:
+                pass_start = time.time()
+                while True:
+                    loss_val = exe.run(program=fleet.main_program,
+                                       fetch_list=[self.avg_cost.name])
+                    loss_val = np.mean(loss_val)
+                    reduce_output = fleet_util.all_reduce(
+                        np.array(loss_val), mode="sum")
+                    loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    loss_val = float(reduce_output) / len(loss_all_trainer)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
+                                                                      loss_val)
+                    fleet_util.print_on_rank(message, 0)
+
+                pass_time = time.time() - pass_start
+            except fluid.core.EOFException:
+                self.reader.reset()
+
+        model_dir = tempfile.mkdtemp()
+        fleet.save_inference_model(
+            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
+        self.check_model_right(model_dir)
+        if fleet.is_first_worker():
+            fleet.save_persistables(executor=exe, dirname=model_dir)
+        shutil.rmtree(model_dir)
+        fleet.stop_worker()
+
+    def do_dataset_training(self, fleet):
+        dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
+        )
+
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+        exe = fluid.Executor(place)
+
+        fleet.init_worker()
+        exe.run(fleet.startup_program)
+
+        thread_num = 2
+        batch_size = 128
+        filelist = []
+        for _ in range(thread_num):
+            filelist.append(train_file_path)
+
+        # config dataset
+        dataset = paddle.fleet.DatasetFactory().create_dataset()
+        dataset.set_batch_size(batch_size)
+        dataset.set_use_var(self.feeds)
+        pipe_command = 'python ctr_dataset_reader.py'
+        dataset.set_pipe_command(pipe_command)
+
+        dataset.set_filelist(filelist)
+        dataset.set_thread(thread_num)
+
+        for epoch_id in range(1):
+            pass_start = time.time()
+            dataset.set_filelist(filelist)
+            exe.train_from_dataset(
+                program=fleet.main_program,
+                dataset=dataset,
+                fetch_list=[self.avg_cost],
+                fetch_info=["cost"],
+                print_period=2,
+                debug=int(os.getenv("Debug", "0")))
+            pass_time = time.time() - pass_start
+
+        if os.getenv("SAVE_MODEL") == "1":
+            model_dir = tempfile.mkdtemp()
+            fleet.save_inference_model(exe, model_dir,
+                                       [feed.name for feed in self.feeds],
+                                       self.avg_cost)
+            self.check_model_right(model_dir)
+            if fleet.is_first_worker():
+                fleet.save_persistables(executor=exe, dirname=model_dir)
+            shutil.rmtree(model_dir)
+
+        fleet.stop_worker()
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistGpuPsCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e811408291a0a3f784ff2b744ce616d6bfbe767
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import time
+import numpy as np
+import logging
+import paddle
+import paddle.fluid as fluid
+#import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+#role = role_maker.GeneralRoleMaker(
+#init_timeout_seconds=100,
+#run_timeout_seconds=100,
+#http_ip_port="127.0.0.1:26001")
+
+#role = role_maker.PaddleCloudRoleMaker(http_ip_port="127.0.0.1:26001")
+
+#role = role_maker.GeneralRoleMaker(path="./tmp4")
+logger.info("Begin")
+res = [0, 0]
+
+logger.info(res)
+
+role = role_maker.PaddleCloudRoleMaker(path="./tmp4")
+
+fleet.init(role)
+print("init wancheng")  #
+#if fleet.is_worker():
+#    import time
+#    time.sleep(3)
+
+a = [5]
+b = [2]
+res = [0]
+if fleet.worker_index() == 0:
+    role._all_reduce(role._node_type_comm, a)
+elif fleet.worker_index() == 1:
+    role._all_reduce(role._node_type_comm, b)
+
+#logger.info(res)
+#print("res ", res)
+
+#role._barrier_all()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
index c69e1247a9bb8f97350ae79bcc6df1bc645204ea..77697896b4d556da8a98c17e281b3d7a6999fd64 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
@@ -152,24 +152,18 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         batch_size = 4
 
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
 
-        compiled_prog = fluid.compiler.CompiledProgram(
-            fleet.main_program).with_data_parallel(
-                loss_name=self.avg_cost.name,
-                build_strategy=self.strategy.get_build_strategy(),
-                exec_strategy=self.strategy.get_execute_strategy())
-
         for epoch_id in range(1):
             self.reader.start()
             try:
                 while True:
-                    loss_val = exe.run(program=compiled_prog,
+                    loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
                     print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..75bff108dd43665df0fc1c8b166a935946b4fbc7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import create_paddle_predictor
+
+
+class PredictorTools(object):
+    '''
+    Paddle-Inference predictor
+    '''
+
+    def __init__(self, model_path, params_file, feeds_var):
+        '''
+        __init__
+        '''
+        self.model_path = model_path
+        self.params_file = params_file
+
+        self.feeds_var = feeds_var
+
+    def _load_model_and_set_config(self):
+        '''
+        load model from file and set analysis config 
+        '''
+        if os.path.exists(os.path.join(self.model_path, self.params_file)):
+            config = AnalysisConfig(
+                os.path.join(self.model_path, "__model__"),
+                os.path.join(self.model_path, self.params_file))
+        else:
+            config = AnalysisConfig(os.path.join(self.model_path))
+
+        if fluid.is_compiled_with_cuda():
+            config.enable_use_gpu(100, 0)
+        else:
+            config.disable_gpu()
+        config.switch_specify_input_names(True)
+        config.switch_use_feed_fetch_ops(False)
+        config.enable_memory_optim()
+        config.disable_glog_info()
+        config.switch_ir_optim(True)
+
+        return config
+
+    def _get_analysis_outputs(self, config):
+        '''
+        Return outputs of paddle inference
+        Args:
+            config (AnalysisConfig): predictor configs
+        Returns:
+            outs (numpy array): forward netwrok prediction outputs
+        '''
+        predictor = create_paddle_predictor(config)
+        tensor_shapes = predictor.get_input_tensor_shape()
+        names = predictor.get_input_names()
+        for i, name in enumerate(names):
+            #assert name in self.feeds_var, '{} not in feeded dict'.format(name)
+            shape = tensor_shapes[name]
+            tensor = predictor.get_input_tensor(name)
+            feed_data = self.feeds_var[i]
+            tensor.copy_from_cpu(np.array(feed_data))
+            if type(feed_data) == fluid.LoDTensor:
+                tensor.set_lod(feed_data.lod())
+
+        # ensure no diff in multiple repeat times
+        repeat_time = 10
+        for i in range(repeat_time):
+            predictor.zero_copy_run()
+
+        output_names = predictor.get_output_names()
+        outs = [
+            predictor.get_output_tensor(out_name).copy_to_cpu()
+            for out_name in output_names
+        ]
+
+        return outs
+
+    def __call__(self):
+        '''
+        __call__
+        '''
+        config = self._load_model_and_set_config()
+        outputs = self._get_analysis_outputs(config)
+
+        return outputs
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
index 68e6f328726f5b2664d31ac46394fa451631388c..d4646833ea2bd4e3c2c07a1962e4e866bdfe776e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
@@ -17,12 +17,13 @@ from __future__ import print_function
 import numpy
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
 
 
-@declarative
+@paddle.jit.to_static
 def dyfunc_assert_variable(x):
     x_v = fluid.dygraph.to_variable(x)
     assert x_v
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
index 27777a62799e104ac8a08fd67df8bdbe2a256724..f105dd5e94744ecca96ee0282432ff4946ab5e04 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
@@ -23,6 +23,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 from bert_dygraph_model import PretrainModelLayer
 from bert_utils import get_bert_config, get_feed_data_reader
 
+from predictor_utils import PredictorTools
+
 program_translator = ProgramTranslator()
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
 )
@@ -152,6 +154,12 @@ def predict_dygraph_jit(data):
         return pred_res
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data)
+    out = output()
+    return out
+
+
 class TestBert(unittest.TestCase):
     def setUp(self):
         self.bert_config = get_bert_config()
@@ -178,9 +186,11 @@ class TestBert(unittest.TestCase):
             dygraph_pred_res = predict_dygraph(self.bert_config, data)
             static_pred_res = predict_static(data)
             dygraph_jit_pred_res = predict_dygraph_jit(data)
+            predictor_pred_res = predict_analysis_inference(data)
 
-            for dy_res, st_res, dy_jit_res in zip(
-                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
+            for dy_res, st_res, dy_jit_res, predictor_res in zip(
+                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res,
+                    predictor_pred_res):
                 self.assertTrue(
                     np.allclose(st_res, dy_res),
                     "dygraph_res: {},\n static_res: {}".format(
@@ -191,6 +201,11 @@ class TestBert(unittest.TestCase):
                     "dygraph_jit_res: {},\n static_res: {}".format(
                         dy_jit_res[~np.isclose(st_res, dy_jit_res)],
                         st_res[~np.isclose(st_res, dy_jit_res)]))
+                self.assertTrue(
+                    np.allclose(st_res, predictor_res),
+                    "dygraph_jit_res: {},\n static_res: {}".format(
+                        predictor_res[~np.isclose(st_res, predictor_res)],
+                        st_res[~np.isclose(st_res, predictor_res)]))
             break
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index c01705dbe9ba655d9cfb538dfdde0474ffa30855..dd58a49bb55c24a5e126965bff415d9a54cff5ad 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -15,13 +15,15 @@
 import math
 import numpy as np
 import unittest
-
+from paddle.jit import to_static
 import paddle.fluid as fluid
 from paddle.fluid import ParamAttr
 from paddle.fluid.dygraph import to_variable
-from paddle.fluid.dygraph import declarative, ProgramTranslator
+from paddle.fluid.dygraph import ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 DATATYPE = 'float32'
 program_translator = ProgramTranslator()
@@ -240,7 +242,7 @@ class BMN(fluid.dygraph.Layer):
             param_attr=ParamAttr(name="PEM_2d4_w"),
             bias_attr=ParamAttr(name="PEM_2d4_b"))
 
-    @declarative
+    @to_static
     def forward(self, x):
         # Base Module
         x = self.b_conv1(x)
@@ -693,9 +695,11 @@ class TestTrain(unittest.TestCase):
             static_pred_res = self.predict_static(video_data)
             dygraph_pred_res = self.predict_dygraph(video_data)
             dygraph_jit_pred_res = self.predict_dygraph_jit(video_data)
+            predictor_pred_res = self.predict_analysis_inference(video_data)
 
-            for dy_res, st_res, dy_jit_res in zip(
-                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
+            for dy_res, st_res, dy_jit_res, predictor_res in zip(
+                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res,
+                    predictor_pred_res):
                 self.assertTrue(
                     np.allclose(st_res, dy_res),
                     "dygraph_res: {},\n static_res: {}".format(
@@ -706,6 +710,11 @@ class TestTrain(unittest.TestCase):
                     "dygraph_jit_res: {},\n static_res: {}".format(
                         dy_jit_res[~np.isclose(st_res, dy_jit_res)],
                         st_res[~np.isclose(st_res, dy_jit_res)]))
+                self.assertTrue(
+                    np.allclose(st_res, predictor_res),
+                    "dygraph_jit_res: {},\n static_res: {}".format(
+                        predictor_res[~np.isclose(st_res, predictor_res)],
+                        st_res[~np.isclose(st_res, predictor_res)]))
             break
 
     def predict_dygraph(self, data):
@@ -749,6 +758,11 @@ class TestTrain(unittest.TestCase):
 
             return pred_res
 
+    def predict_analysis_inference(self, data):
+        output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data])
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a689354f56757ba754b76e3d407cb7083b95b3b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from paddle.static import InputSpec
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit
+
+import unittest
+
+program_trans = ProgramTranslator()
+
+
+class SimpleNet(Layer):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.linear = fluid.dygraph.Linear(10, 3)
+
+    @declarative(input_spec=[InputSpec(shape=[None, 10], dtype='float32')])
+    def forward(self, x, a=1, b=2):
+        y = self.inner_function(x)
+        return y
+
+    # `declarative` is not essential, add it to test for robustness.
+    @declarative
+    def inner_function(self, x):
+        y = self.linear(x)
+        return y
+
+    def add_func(self, x, y):
+        z = x + y
+        return z
+
+    @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]])
+    def func_with_list(self, l):
+        x, y, int_val = l
+        z = x + y
+        z = z + int_val
+        return z
+
+    @declarative(input_spec=[{
+        'x': InputSpec([None, 10]),
+        'y': InputSpec([None, 10])
+    }])
+    def func_with_dict(self, d):
+        x = d['x']
+        y = d['y']
+        int_val = d['int_val']
+
+        z = x + y
+        z = z + int_val
+
+        return z
+
+    @declarative(input_spec=[[
+        InputSpec([None]), {
+            'x': InputSpec([None, 10]),
+            'y': InputSpec([None, 10])
+        }
+    ]])
+    def func_with_list_dict(self, dl):
+        bias = dl[0]
+        x = dl[1]['x']
+        y = dl[1]['y']
+
+        z = x + y
+        z = z + bias
+
+        return z
+
+
+class TestInputSpec(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def test_with_input_spec(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. each method holds independent program cache
+            out = net(x)
+            self.assertTrue(len(net.forward.program_cache) == 1)
+
+            # 2. test save load
+            jit.save(net, './simple_net')
+            infer_net = fluid.dygraph.jit.load('./simple_net')
+            pred = infer_net(x)
+            self.assertTrue(np.allclose(out.numpy(), pred.numpy()))
+
+            # 3. we can decorate any method
+            x_2 = to_variable(np.ones([4, 20]).astype('float32'))
+            # uses `declarative(func)` instead of `@declarative`
+            net.add_func = declarative(net.add_func)
+            out = net.add_func(x_2, np.ones([20]).astype('float32'))
+            self.assertTrue(len(net.add_func.program_cache) == 1)
+
+            # 5. test input with list
+            out = net.func_with_list([x, y, int_val])
+
+            # 6. test input with dict
+            out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val})
+
+            # 7. test input with lits contains dict
+            int_np = np.ones([1]).astype('float32')
+            out = net.func_with_list_dict([int_np, {'x': x, 'y': y}])
+
+    def test_with_error(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. kwargs and input_spec should not be specificed in same time
+            with self.assertRaises(ValueError):
+                net(x, a=1, other_kwarg=2)
+
+            # 2. requires len(input_spec) <= len(args)
+            with self.assertRaises(ValueError):
+                net.add_func = declarative(
+                    net.add_func,
+                    input_spec=[
+                        InputSpec([-1, 10]), InputSpec([-1, 10]),
+                        InputSpec([10])
+                    ])
+                net.add_func(x, y)
+
+    def test_concrete_program(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+            # We can get concrete_program by specificing InputSpec information. Faking input is no need.
+            net.add_func = declarative(
+                net.add_func,
+                input_spec=[
+                    InputSpec([-1, 10]), InputSpec(
+                        [-1, 10], name='y')
+                ])
+            cp1 = net.add_func.concrete_program
+            self.assertTrue(cp1.inputs[-1].shape == (-1, 10))
+            self.assertTrue(cp1.inputs[-1].name == 'y')
+
+            # generate another program
+            net.add_func = declarative(
+                net.add_func,
+                input_spec=[InputSpec([10]), InputSpec(
+                    [10], name='label')])
+            cp2 = net.add_func.concrete_program
+            self.assertTrue(cp2.inputs[-1].shape == (10, ))
+            self.assertTrue(cp2.inputs[-1].name == 'label')
+            # Note(Aurelius84): New instance will be returned if we use `declarative(foo)` every time.
+            # So number of cache program is 1.
+            self.assertTrue(len(net.add_func.program_cache) == 1)
+            self.assertTrue(cp1 != cp2)
+
+
+def foo_func(a, b, c=1, d=2):
+    z = a + b
+    return z
+
+
+class TestDifferentInputSpecCacheProgram(unittest.TestCase):
+    def test_with_different_input(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x_data = np.ones([16, 10]).astype('float32')
+            y_data = np.ones([10]).astype('float32') * 2
+            z_data = np.ones([10]).astype('float32') * 2.2
+
+            foo = declarative(foo_func)
+
+            # [16, 10] + [10] (varbase)
+            out_1 = foo(to_variable(x_data), to_variable(y_data))
+            self.assertTrue(np.allclose(x_data + y_data, out_1.numpy()))
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy)
+            out_2 = foo(to_variable(x_data), y_data)
+            self.assertTrue(np.allclose(x_data + y_data, out_2.numpy()))
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy)
+            out_3 = foo(to_variable(x_data), z_data)
+            self.assertTrue(np.allclose(x_data + z_data, out_3.numpy()))
+            # hit cache program
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy) with other different arguments (c=3)
+            out_4 = foo(to_variable(x_data), z_data, 3)
+            self.assertTrue(np.allclose(x_data + z_data, out_4.numpy()))
+            # create a new program
+            self.assertTrue(len(foo.program_cache) == 2)
+
+    def test_get_concrete_program(self):
+
+        foo = declarative(foo_func)
+
+        # 1. specific InputSpec for `x`/`y`
+        concrete_program_1 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]))
+        print(concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 1)
+
+        # 2. specific `c`/`d` explicitly with same default value
+        concrete_program_2 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]), 1, 2)
+        self.assertTrue(concrete_program_2 == concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 1)
+
+        # 3. specific `c` = 2
+        concrete_program_3 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]), c=2)
+        self.assertTrue(concrete_program_3 != concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 2)
+
+        # 4. specific x.shape = [10]
+        concrete_program_4 = foo.get_concrete_program(
+            InputSpec([10]), InputSpec([10]))
+        self.assertTrue(concrete_program_4 != concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 3)
+
+        # 5. only specific InputSpec of x
+        with self.assertRaises(ValueError):
+            concrete_program_5 = foo.get_concrete_program(InputSpec([10]))
+
+        # 6. specific unknown kwargs `e`=4
+        concrete_program_5 = foo.get_concrete_program(
+            InputSpec([10]), InputSpec([10]), e=4)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
index c8051b3f24170693259b17456bcc124221117900..af1e44ffe212343c02a9bed8a8cacb0a966451aa 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
@@ -19,7 +19,7 @@ import numpy as np
 import unittest
 
 import paddle.fluid as fluid
-from paddle.fluid.dygraph.jit import declarative
+from paddle.jit import to_static
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
 
 PLACE = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
@@ -76,7 +76,7 @@ class MainNetWithDict(fluid.dygraph.Layer):
         self.output_size = output_size
         self.sub_net = SubNetWithDict(hidden_size, output_size)
 
-    @declarative
+    @to_static
     def forward(self, input, max_len=4):
         input = fluid.dygraph.to_variable(input)
         cache = {
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..88697bc1b36838afd743596cfec036271be33856
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.static import InputSpec
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec
+
+from test_declarative import foo_func
+
+import unittest
+
+
+class TestFunctionSpec(unittest.TestCase):
+    def test_constructor(self):
+        foo_spec = FunctionSpec(foo_func)
+        args_name = foo_spec.args_name
+        self.assertListEqual(args_name, ['a', 'b', 'c', 'd'])
+        self.assertTrue(foo_spec.dygraph_function == foo_func)
+        self.assertTrue(foo_spec.input_spec is None)
+
+    def test_verify_input_spec(self):
+        a_spec = InputSpec([None, 10], name='a')
+        b_spec = InputSpec([10], name='b')
+
+        # type(input_spec) should be list or tuple
+        with self.assertRaises(TypeError):
+            foo_spec = FunctionSpec(foo_func, input_spec=a_spec)
+
+        # each element of input_spec should be `InputSpec`
+        with self.assertRaises(ValueError):
+            foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, 10])
+
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        self.assertTrue(len(foo_spec.flat_input_spec) == 2)
+
+    def test_unified_args_and_kwargs(self):
+        foo_spec = FunctionSpec(foo_func)
+        # case 1: foo(10, 20, c=4)
+        args, kwargs = foo_spec.unified_args_and_kwargs([10, 20], {'c': 4})
+        self.assertTupleEqual(args, (10, 20, 4, 2))
+        self.assertTrue(len(kwargs) == 0)
+
+        # case 2: foo(a=10, b=20, d=4)
+        args, kwargs = foo_spec.unified_args_and_kwargs(
+            [], {'a': 10,
+                 'b': 20,
+                 'd': 4})
+        self.assertTupleEqual(args, (10, 20, 1, 4))
+        self.assertTrue(len(kwargs) == 0)
+
+        # case 3: foo(10, b=20)
+        args, kwargs = foo_spec.unified_args_and_kwargs([10], {'b': 20})
+        self.assertTupleEqual(args, (10, 20, 1, 2))
+        self.assertTrue(len(kwargs) == 0)
+
+        # assert len(self._arg_names) >= len(args)
+        with self.assertRaises(ValueError):
+            foo_spec.unified_args_and_kwargs([10, 20, 30, 40, 50], {'c': 4})
+
+        # assert arg_name should be in kwargs
+        with self.assertRaises(ValueError):
+            foo_spec.unified_args_and_kwargs([10], {'c': 4})
+
+    def test_args_to_input_spec(self):
+        a_spec = InputSpec([None, 10], name='a')
+        b_spec = InputSpec([10], name='b')
+
+        a_tensor = paddle.static.data(name='a_var', shape=[4, 10])
+        b_tensor = paddle.static.data(name='b_var', shape=[4, 10])
+        kwargs = {'c': 1, 'd': 2}
+
+        # case 1
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        input_with_spec = foo_spec.args_to_input_spec(
+            (a_tensor, b_tensor, 1, 2), {})
+        self.assertTrue(len(input_with_spec) == 4)
+        self.assertTrue(input_with_spec[0] == a_spec)  # a
+        self.assertTrue(input_with_spec[1] == b_spec)  # b
+        self.assertTrue(input_with_spec[2] == 1)  # c
+        self.assertTrue(input_with_spec[3] == 2)  # d
+
+        # case 2
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec])
+        input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor), {})
+        self.assertTrue(len(input_with_spec) == 2)
+        self.assertTrue(input_with_spec[0] == a_spec)  # a
+        self.assertTupleEqual(input_with_spec[1].shape, (4, 10))  # b.shape
+        self.assertEqual(input_with_spec[1].name, 'b_var')  # b.name
+
+        # case 3
+        # assert kwargs is None if set `input_spec`
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec])
+        with self.assertRaises(ValueError):
+            input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor),
+                                                          {'c': 4})
+
+        # case 4
+        # assert len(args) >= len(self._input_spec)
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        with self.assertRaises(ValueError):
+            input_with_spec = foo_spec.args_to_input_spec((a_tensor, ), {})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index fdf6daf6263e2bb7cf8ef2c3ad1373fb079f0037..0e2bac9fa5b5c9e47ce8a08b0187531a3b83dcee 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -27,6 +27,8 @@ from paddle.fluid.dygraph import Embedding, Linear, GRUUnit
 from paddle.fluid.dygraph import declarative, ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 
 program_translator = ProgramTranslator()
@@ -536,6 +538,7 @@ class TestLACModel(unittest.TestCase):
             dy_pre = self.predict_dygraph(batch)
             st_pre = self.predict_static(batch)
             dy_jit_pre = self.predict_dygraph_jit(batch)
+            predictor_pre = self.predict_analysis_inference(batch)
             self.assertTrue(
                 np.allclose(dy_pre, st_pre),
                 msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
@@ -543,6 +546,10 @@ class TestLACModel(unittest.TestCase):
                 np.allclose(dy_jit_pre, st_pre),
                 msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre,
                                                                st_pre))
+            self.assertTrue(
+                np.allclose(predictor_pre, st_pre),
+                msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                                  st_pre))
 
     def predict_dygraph(self, batch):
         words, targets, length = batch
@@ -591,6 +598,14 @@ class TestLACModel(unittest.TestCase):
 
             return pred_res.numpy()
 
+    def predict_analysis_inference(self, batch):
+        words, targets, length = batch
+
+        output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME,
+                                [words, length])
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..214cd95d3bc620b3bcadb88e57c7e54a593eaaf4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import io
+import logging
+import os
+import sys
+import unittest
+
+import gast
+import six
+
+import paddle
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
+
+# TODO(liym27): library mock needs to be installed separately in PY2,
+#  but CI environment has not installed mock yet.
+#  After discuss with Tian Shuo, now use mock only in PY3, and use it in PY2 after CI installs it.
+if six.PY3:
+    from unittest import mock
+# else:
+#     import mock
+
+
+class TestLoggingUtils(unittest.TestCase):
+    def setUp(self):
+        self.verbosity_level = 1
+        self.code_level = 3
+        self.translator_logger = logging_utils._TRANSLATOR_LOGGER
+
+    def test_verbosity(self):
+        paddle.jit.set_verbosity(None)
+        os.environ[logging_utils.VERBOSITY_ENV_NAME] = '3'
+        self.assertEqual(logging_utils.get_verbosity(), 3)
+
+        paddle.jit.set_verbosity(self.verbosity_level)
+        self.assertEqual(self.verbosity_level, logging_utils.get_verbosity())
+
+        # String is not supported
+        with self.assertRaises(TypeError):
+            paddle.jit.set_verbosity("3")
+
+        with self.assertRaises(TypeError):
+            paddle.jit.set_verbosity(3.3)
+
+    def test_code_level(self):
+
+        paddle.jit.set_code_level(None)
+        os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2'
+        self.assertEqual(logging_utils.get_code_level(), 2)
+
+        paddle.jit.set_code_level(self.code_level)
+        self.assertEqual(logging_utils.get_code_level(), self.code_level)
+
+        paddle.jit.set_code_level(9)
+        self.assertEqual(logging_utils.get_code_level(), 9)
+
+        with self.assertRaises(TypeError):
+            paddle.jit.set_code_level(3.3)
+
+    def test_log(self):
+        stream = io.BytesIO() if six.PY2 else io.StringIO()
+        log = self.translator_logger.logger
+        stdout_handler = logging.StreamHandler(stream)
+        log.addHandler(stdout_handler)
+
+        warn_msg = "test_warn"
+        error_msg = "test_error"
+        log_msg_1 = "test_log_1"
+        log_msg_2 = "test_log_2"
+
+        if six.PY3:
+            with mock.patch.object(sys, 'stdout', stream):
+                logging_utils.warn(warn_msg)
+                logging_utils.error(error_msg)
+                self.translator_logger.verbosity_level = 2
+                logging_utils.log(1, log_msg_1)
+                logging_utils.log(2, log_msg_2)
+
+            result_msg = '\n'.join([warn_msg, error_msg, log_msg_2, ""])
+            self.assertEqual(result_msg, stream.getvalue())
+
+    def test_log_transformed_code(self):
+        source_code = "x = 3"
+        ast_code = gast.parse(source_code)
+
+        stream = io.BytesIO() if six.PY2 else io.StringIO()
+        log = self.translator_logger.logger
+        stdout_handler = logging.StreamHandler(stream)
+        log.addHandler(stdout_handler)
+
+        if six.PY3:
+            with mock.patch.object(sys, 'stdout', stream):
+                paddle.jit.set_code_level(1)
+                logging_utils.log_transformed_code(1, ast_code,
+                                                   "BasicApiTransformer")
+
+                paddle.jit.set_code_level()
+                logging_utils.log_transformed_code(
+                    logging_utils.LOG_AllTransformer, ast_code,
+                    "All Transformers")
+
+            self.assertIn(source_code, stream.getvalue())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index b8aa0379638fadd19b4956a56c1a3e4811558535..1ef3bd1bf150056816283c83fa3ff6af1e589732 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -25,10 +25,11 @@ from paddle.fluid.dygraph.base import switch_to_static_graph
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.jit import declarative
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 
 
@@ -100,7 +101,7 @@ class MNIST(fluid.dygraph.Layer):
                     loc=0.0, scale=scale)),
             act="softmax")
 
-    @declarative
+    @paddle.jit.to_static
     def forward(self, inputs, label=None):
         x = self.inference(inputs)
         if label is not None:
@@ -132,7 +133,7 @@ class TestMNIST(unittest.TestCase):
             drop_last=True)
 
 
-class TestMNISTWithDeclarative(TestMNIST):
+class TestMNISTWithToStatic(TestMNIST):
     """
     Tests model if doesn't change the layers while decorated
     by `dygraph_to_static_output`. In this case, everything should
@@ -145,7 +146,7 @@ class TestMNISTWithDeclarative(TestMNIST):
     def train_dygraph(self):
         return self.train(to_static=False)
 
-    def test_mnist_declarative(self):
+    def test_mnist_to_static(self):
         dygraph_loss = self.train_dygraph()
         static_loss = self.train_static()
         self.assertTrue(
@@ -220,6 +221,10 @@ class TestMNISTWithDeclarative(TestMNIST):
             dygraph_infer_out = self.jit_load_and_run_inference_dygraph(
                 infer_model_path, inputs)
             self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out))
+            # load in Paddle-Inference
+            predictor_infer_out = self.predictor_load_and_run_inference_analysis(
+                infer_model_path, inputs)
+            self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out))
 
     @switch_to_static_graph
     def jit_load_and_run_inference_static(self, model_path, inputs):
@@ -241,6 +246,11 @@ class TestMNISTWithDeclarative(TestMNIST):
         pred = infer_net(inputs[0])
         return pred.numpy()
 
+    def predictor_load_and_run_inference_analysis(self, model_path, inputs):
+        output = PredictorTools(model_path, VARIABLE_FILENAME, inputs)
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index ef0f6e7f0831eea8d2f694413c5231ecea292ff4..5ec3de5871dd6787c06938a8b771f7d14e54e1e0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -23,6 +23,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
 import unittest
 
+from predictor_utils import PredictorTools
+
 # Note: Set True to eliminate randomness.
 #     1. For one operation, cuDNN has several algorithms,
 #        some algorithm results are non-deterministic, like convolution algorithms.
@@ -550,6 +552,12 @@ def predict_dygraph_jit(args, data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(args, data):
+    output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestMobileNet(unittest.TestCase):
     def setUp(self):
         self.args = Args()
@@ -577,12 +585,18 @@ class TestMobileNet(unittest.TestCase):
         dy_pre = predict_dygraph(self.args, image)
         st_pre = predict_static(self.args, image)
         dy_jit_pre = predict_dygraph_jit(self.args, image)
+        predictor_pre = predict_analysis_inference(self.args, image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(
+                predictor_pre, st_pre, atol=1e-5),
+            msg="inference_pred_res:\n {}\n, st_pre: \n{}.".format(
+                predictor_pre, st_pre))
 
     def test_mobile_net(self):
         # MobileNet-V1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
index 3da60e955deee9b6d4c74ba5ff1a550ae135afdb..f0fbe54f9dbbf93121655e784601467c13b3a70d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
@@ -133,7 +133,7 @@ class TestWithTrainAndEval(unittest.TestCase):
             x = fluid.dygraph.to_variable(x_data)
             linear_net(x)
 
-            _, partial_layer = program_translator.get_program_cache().last()[-1]
+            _, partial_layer = linear_net.forward.program_cache.last()[-1]
             # check default mode is for training
             self.assertEqual(partial_layer.program,
                              partial_layer._train_program)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 90d210eba1e0fb1eeaf5eb0c8cbc0ff46c35328f..46eb2b42e9265ac7f6340ee0be3a7127e5246eef 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -26,12 +26,15 @@ from paddle.fluid.dygraph import declarative, ProgramTranslator
 from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 IMAGENET1000 = 1281167
 base_lr = 0.001
 momentum_rate = 0.9
 l2_decay = 1e-4
-batch_size = 8
+# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout.
+batch_size = 2
 epoch_num = 1
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
     else fluid.CPUPlace()
@@ -306,6 +309,12 @@ def predict_dygraph_jit(data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestResnet(unittest.TestCase):
     def train(self, to_static):
         program_translator.enable(to_static)
@@ -316,12 +325,17 @@ class TestResnet(unittest.TestCase):
         dy_pre = predict_dygraph(image)
         st_pre = predict_static(image)
         dy_jit_pre = predict_dygraph_jit(image)
+        predictor_pre = predict_analysis_inference(image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(predictor_pre, st_pre),
+            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                              st_pre))
 
     def test_resnet(self):
         static_loss = self.train(to_static=True)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
index 0386b7c7a17a0f93040fa18d688347f30f27850d..6cf59c030c00384b225d5d13160f68a3558084b9 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
@@ -133,7 +133,7 @@ class TestPartialProgramRaiseError(unittest.TestCase):
             x = fluid.dygraph.to_variable(x_data)
             out = net(x)
 
-            program_cache = program_translator.get_program_cache()
+            program_cache = SimpleFcLayer.forward.program_cache
             _, (concrete_program, _) = program_cache.last()
 
             params = concrete_program.parameters
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index c34e9478c8eab38c429c01db5fae460eeac6a4bd..30cba78fec19c169966e85ff43e79c3a00889616 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -26,6 +26,8 @@ from paddle.fluid.dygraph import declarative
 from paddle.fluid.dygraph import ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 np.random.seed(SEED)
 
@@ -434,6 +436,12 @@ def predict_dygraph_jit(data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestSeResnet(unittest.TestCase):
     def setUp(self):
         self.train_reader = paddle.batch(
@@ -447,12 +455,17 @@ class TestSeResnet(unittest.TestCase):
         dy_pre = predict_dygraph(image)
         st_pre = predict_static(image)
         dy_jit_pre = predict_dygraph_jit(image)
+        predictor_pre = predict_analysis_inference(image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(predictor_pre, st_pre),
+            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                              st_pre))
 
     def test_check_result(self):
         pred_1, loss_1, acc1_1, acc5_1 = train(
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
similarity index 61%
rename from python/paddle/fluid/tests/unittests/test_hdfs.py
rename to python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index e0e6c8c14f6db0685df517b0c1d84957f4d36028..6a752bc3053d7d0672bd0002250252c3bbbfa1e1 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -19,12 +19,12 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
 
-class FSTest(unittest.TestCase):
+class FSTestBase(unittest.TestCase):
     def _test_dirs(self, fs):
         dir_path = os.path.abspath("./test_dir")
         fs.delete(dir_path)
@@ -188,106 +188,6 @@ class FSTest(unittest.TestCase):
         except Exception as e:
             pass
 
-    def test_exists(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=15 * 1000,
-            sleep_inter=100)
-        self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
-        self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
-        self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
-        dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs.py"))
-        self.assertTrue(dirs == [])
-        self.assertTrue(len(files) == 1)
-        dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
-
-    def test_hdfs(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=15 * 1000,
-            sleep_inter=100)
-        self._test_rm(fs)
-        self._test_touch(fs)
-        self._test_dirs(fs)
-        self._test_upload(fs)
-
-        self._test_download(fs)
-        self._test_mkdirs(fs)
-        self._test_list_dir(fs)
-        self._test_try_upload(fs)
-        self._test_try_download(fs)
-
-    def test_local(self):
-        fs = LocalFS()
-        self._test_rm(fs)
-        self._test_touch(fs)
-        self._test_dirs(fs)
-        self._test_touch_file(fs)
-        self._test_mkdirs(fs)
-        self._test_list_dir(fs)
-        self._test_try_upload(fs)
-        self._test_try_download(fs)
-
-    def test_timeout(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=6 * 1000,
-            sleep_inter=100)
-        src = "hdfs_test_timeout"
-        dst = "new_hdfs_test_timeout"
-        fs.delete(dst)
-        fs.mkdirs(src)
-        fs.mkdirs(dst)
-        fs.mkdirs(dst + "/" + src)
-        output = ""
-        try:
-            fs.mv(src, dst, test_exists=False)
-            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
-                                                                        output))
-        except FSTimeOut as e:
-            print("execute mv {} to {} timeout".format(src, dst))
-
-        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
-        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
-        self.assertNotEqual(ret, 0)
-        print("second mv ret:{} output:{}".format(ret, output))
-
-    def test_is_dir(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=15 * 1000,
-            sleep_inter=100)
-        self.assertFalse(fs.is_dir("./test_hdfs.py"))
-        s = """
-java.io.IOException: Input/output error
- responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
-	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
-	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
-	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
-	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
-	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
-	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
-	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
-	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
-	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
-	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
-        """
-
-        print("split lines:", s.splitlines())
-        self.assertTrue(fs._test_match(s.splitlines()) != None)
-
-    def test_config(self):
-        config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            config,
-            time_out=15 * 1000,
-            sleep_inter=100)
-
     def _test_list_dir(self, fs):
         fs = HDFSClient(
             "/usr/local/hadoop-2.7.7/",
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
index 7edca281fff9df02436b2cc1af5409db0ea1981d..46d574dad0d0ae1f72617c6aaf3369b16195f76b 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
@@ -77,12 +77,13 @@ class FusionGroupPassTest(PassTest):
             self.check_output_with_place(fluid.CUDAPlace(0))
 
 
-class FusionGroupPassTest1(FusionGroupPassTest):
+class FusionGroupPassComplicatedTest(FusionGroupPassTest):
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
-            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 5)
+            self.feed_vars = self._prepare_feed_vars([32, 64], dtype, 5)
 
-            tmp_0 = layers.assign(self.feed_vars[0])
+            one = layers.fill_constant(shape=[1], dtype=dtype, value=1.0)
+            tmp_0 = one * self.feed_vars[0]
             # subgraph with 9 op nodes
             tmp_1 = tmp_0 * layers.sigmoid(self.feed_vars[1]) + layers.sigmoid(
                 self.feed_vars[2]) * layers.tanh(self.feed_vars[3])
@@ -94,7 +95,7 @@ class FusionGroupPassTest1(FusionGroupPassTest):
         self.fetch_list = [tmp_2, self.grad(tmp_0)]
 
 
-class FusionGroupPassTest2(FusionGroupPassTest):
+class FusionGroupPassInplaceTest(FusionGroupPassTest):
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
@@ -103,15 +104,13 @@ class FusionGroupPassTest2(FusionGroupPassTest):
                     name="data3", shape=[128, 32], dtype=dtype))
 
             # subgraph with 3 op node
-            tmp_0 = self.feed_vars[0] + self.feed_vars[1]
-            tmp_1 = layers.relu(self.feed_vars[2] * tmp_0)
-            # subgraph with 2 op nodes
-            tmp_2 = layers.relu(layers.sigmoid(self.feed_vars[3]))
-            tmp_3 = layers.mul(tmp_1, tmp_2)
+            tmp_0 = self.feed_vars[0] - self.feed_vars[1]
+            tmp_1 = tmp_0 * self.feed_vars[2]
+            tmp_2 = layers.assign(tmp_1, output=tmp_0)
+            tmp_3 = layers.mul(tmp_2, self.feed_vars[3])
 
-        self.append_gradients(tmp_3)
-        self.num_fused_ops = 2
-        self.fetch_list = [tmp_3, self.grad(tmp_1)]
+        self.num_fused_ops = 1
+        self.fetch_list = [tmp_3]
 
 
 class FusionGroupPassTestFP64(FusionGroupPassTest):
diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py
index 64fee35710ae1b8690ec41b247ceb55e180b13c9..13041827ffeabd3d6b79e4f34a67bd09624e54f6 100644
--- a/python/paddle/fluid/tests/unittests/launch_function_helper.py
+++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 from multiprocessing import Pool, Process
 import os
+import socket
+from contextlib import closing
+import psutil
 
 
 def launch_func(func, env_dict):
@@ -20,3 +23,31 @@ def launch_func(func, env_dict):
         os.environ[key] = env_dict[key]
     proc = Process(target=func)
     return proc
+
+
+def wait(procs, timeout=None):
+    # wait
+    decents = []
+    for p in procs:
+        for child in psutil.Process(p.pid).children(recursive=True):
+            decents.append(child)
+
+    gone, alive = psutil.wait_procs(decents, timeout=timeout)
+    for p in alive:
+        p.kill()
+    for p in gone:
+        if p.returncode != 0:
+            sys.exit(1)
+
+
+def _find_free_port(port_set):
+    def __free_port():
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
+    while True:
+        port = __free_port()
+        if port not in port_set:
+            port_set.add(port)
+            return port
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index 1e9c4b56093b65c545e19243f57f933b93b486ae..d904bdbfa96ae1df83a0cacde0822611ac55757e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -19,7 +19,7 @@ import numpy as np
 from scipy.special import expit
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestSigmoid
+from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestRelu6, TestSigmoid
 from paddle.fluid.tests.unittests.test_gelu_op import gelu
 from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
 
@@ -34,6 +34,15 @@ class TestMKLDNNReluDim2(TestRelu):
         self.dtype = np.float32
 
 
+class TestMKLDNNRelu6Dim2(TestRelu6):
+    def setUp(self):
+        super(TestMKLDNNRelu6Dim2, self).setUp()
+        self.attrs.update({"use_mkldnn": True})
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+
 class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
     def setUp(self):
         super(TestMKLDNNLeakyReluDim2, self).setUp()
@@ -103,13 +112,10 @@ class TestMKLDNNSwishDim2(TestSwish):
     def setUp(self):
         super(TestMKLDNNSwishDim2, self).setUp()
 
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        beta = 2.3
-        out = x * expit(beta * x)
+        self.attrs["use_mkldnn"] = True
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True, "beta": beta}
+    def init_dtype(self):
+        self.dtype = np.float32
 
     def init_dtype(self):
         self.dtype = np.float32
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
index bd8842da03e988c374586983d75cc3031c446906..11b453125dfdfb267fb9f0d4d98b93e08959116e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
@@ -105,8 +105,11 @@ class TestDnnlMatMulOpInt8NoScales(TestDnnlMatMulOp):
 
 
 class TestDnnlMatMulOpInt8(TestDnnlMatMulOp):
+    # Due to limitation in int8 matmul implementation
+    # on older platforms (BDW, SKX) we needed to reduce
+    # range from [-127, 127] to [-63, 63]
     def quantize(self, tensor):
-        scale = 127. / np.abs(np.amax(tensor))
+        scale = 63. / np.abs(np.amax(tensor))
         quantized = np.round(scale * tensor).astype("int8")
         return scale, quantized
 
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1320623f8f8422f14677a3ca629735838dc94aa8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import contextlib
+import unittest
+import numpy as np
+import six
+import pickle
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.nn import Conv2d, Pool2D, Linear, SyncBatchNorm
+from paddle.fluid.dygraph.base import to_variable
+
+from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
+
+
+class TestLayer(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(TestLayer, self).__init__()
+
+        self._conv = Conv2d(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            bias_attr=False)
+
+        self._sync_batch_norm = SyncBatchNorm(num_filters)
+
+        self._conv2 = Conv2d(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            bias_attr=False)
+
+        self._sync_batch_norm2 = SyncBatchNorm(
+            num_filters,
+            weight_attr=False,
+            bias_attr=False,
+            track_running_stats=False)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._sync_batch_norm(y)
+        y = self._conv2(y)
+        y = self._sync_batch_norm2(y)
+
+        return y
+
+
+class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
+    def get_model(self):
+        model = TestLayer(3, 64, 7)
+        train_reader = paddle.batch(
+            paddle.dataset.flowers.test(use_xmap=False),
+            batch_size=32,
+            drop_last=True)
+        opt = fluid.optimizer.Adam(
+            learning_rate=1e-3, parameter_list=model.parameters())
+        return model, train_reader, opt
+
+    def run_one_loop(self, model, opt, data):
+        batch_size = len(data)
+        dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                              for x in data]).astype('float32')
+        img = to_variable(dy_x_data)
+        img.stop_gradient = False
+
+        out = model(img)
+
+        out = fluid.layers.mean(out)
+
+        return out
+
+
+if __name__ == "__main__":
+    runtime_main(TestSyncBatchNorm)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ef4779f0e6f2df2f0b79f776d1e7b6c5cbf31a22..ec6b81f138321f2119a5a5aaf4b5ba9ae8f7e69b 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -34,7 +34,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     def check_network_convergence(cls,
                                   method,
                                   use_cuda=True,
-                                  iter=50,
+                                  iter=5,
                                   batch_size=None,
                                   feed_dict=None,
                                   feed_data_reader=None,
diff --git a/python/paddle/fluid/tests/unittests/parallel_test.sh b/python/paddle/fluid/tests/unittests/parallel_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9da4f035345d7f04b69a1c9483cba7022ad10baa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+unset https_proxy http_proxy
+export FLAGS_rpc_disable_reuse_port=1
+
+name=${TEST_TARGET_NAME}
+UnitTests=${UnitTests}
+TEST_TIMEOUT=${TEST_TIMEOUT}
+
+if [[ ${name}"x" == "x" ]]; then
+    echo "can't find name, please set TEST_TARGET_NAME first"
+    exit 1
+fi
+
+if [[ ${UnitTests}"x" == "x" ]]; then
+    echo "can't find UnitTests, please set TEST_TARGET_NAME first"
+    exit 1
+fi
+
+if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then
+    echo "can't find ${TEST_TIMEOUT}, please set ${TEST_TIMEOUT} first"
+    exit 1
+fi
+
+if [[ ${WITH_COVERAGE} == "ON" ]]; then
+    PYTHON_EXEC="python -u -m coverage run --branch -p "
+else
+    PYTHON_EXEC="python -u "
+fi
+
+run_time=$(( $TEST_TIMEOUT - 10 ))
+echo "run_time: ${run_time}"
+for ut in ${UnitTests}; do
+    echo "start ${ut}"
+    timeout -s SIGKILL ${run_time} ${PYTHON_EXEC} ./${ut}.py > ${ut}_run.log 2>&1 &
+done
+
+FAIL=0
+for job in `jobs -p`
+do
+    echo "jobs -p result:" `jobs -p`
+    echo $job
+    wait $job || let FAIL=FAIL+1
+done
+
+echo "fail_num:" $FAIL
+
+if [ "$FAIL" == "0" ];
+then
+    exit 0
+else
+    echo "FAIL! ($FAIL)"
+
+    for ut in ${UnitTests}; do
+        log=${ut}_run.log
+        echo "cat ${log}"
+        cat $log
+    done
+
+    exit 1
+fi
diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach(TEST_OP)
diff --git a/python/paddle/fleet/runtime/__init__.py b/python/paddle/fluid/tests/unittests/rnn/__init__.py
similarity index 87%
rename from python/paddle/fleet/runtime/__init__.py
rename to python/paddle/fluid/tests/unittests/rnn/__init__.py
index f38287cf51a728011d16f735e58ec54a7cdfe0c8..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/python/paddle/fleet/runtime/__init__.py
+++ b/python/paddle/fluid/tests/unittests/rnn/__init__.py
@@ -11,7 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .collective_runtime import CollectiveRuntime
-
-__all__ = ["CollectiveRuntime"]
diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f10694a4b47e8a58e2fd0db4453cafedcbbdc1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/convert.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+
+
+def convert_params_for_cell(np_cell, paddle_cell):
+    state = np_cell.parameters
+    for k, v in paddle_cell.named_parameters():
+        v.set_value(state[k])
+
+
+def convert_params_for_cell_static(np_cell, paddle_cell, place):
+    state = np_cell.parameters
+    for k, v in paddle_cell.named_parameters():
+        scope = paddle.static.global_scope()
+        tensor = scope.find_var(v.name).get_tensor()
+        tensor.set(state[k], place)
+
+
+def convert_params_for_net(np_net, paddle_net):
+    for np_layer, paddle_layer in zip(np_net, paddle_net):
+        if hasattr(np_layer, "cell"):
+            convert_params_for_cell(np_layer.cell, paddle_layer.cell)
+        else:
+            convert_params_for_cell(np_layer.cell_fw, paddle_layer.cell_fw)
+            convert_params_for_cell(np_layer.cell_bw, paddle_layer.cell_bw)
+
+
+def convert_params_for_net_static(np_net, paddle_net, place):
+    for np_layer, paddle_layer in zip(np_net, paddle_net):
+        if hasattr(np_layer, "cell"):
+            convert_params_for_cell_static(np_layer.cell, paddle_layer.cell,
+                                           place)
+        else:
+            convert_params_for_cell_static(np_layer.cell_fw,
+                                           paddle_layer.cell_fw, place)
+            convert_params_for_cell_static(np_layer.cell_bw,
+                                           paddle_layer.cell_bw, place)
diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0b8374b95cf334b4eced550a79d7c717c07aa7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
@@ -0,0 +1,516 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+
+class LayerMixin(object):
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+
+class LayerListMixin(LayerMixin):
+    def __init__(self, layers=None):
+        self._layers = list(layers) if layers else []
+
+    def append(self, layer):
+        self._layers.append(layer)
+
+    def __iter__(self):
+        return iter(self._layers)
+
+
+class SimpleRNNCell(LayerMixin):
+    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        if nonlinearity == 'tanh':
+            self.nonlinearity = np.tanh
+        else:
+            self.nonlinearity = lambda x: np.maximum(x, 0.)
+
+        self.parameters = dict()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = np.random.uniform(-std, std, (
+            hidden_size, input_size)).astype('float64')
+        self.weight_hh = np.random.uniform(-std, std, (
+            hidden_size, hidden_size)).astype('float64')
+        self.parameters['weight_ih'] = self.weight_ih
+        self.parameters['weight_hh'] = self.weight_hh
+        if bias:
+            self.bias_ih = np.random.uniform(-std, std,
+                                             (hidden_size, )).astype('float64')
+            self.bias_hh = np.random.uniform(-std, std,
+                                             (hidden_size, )).astype('float64')
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+        else:
+            self.bias_ih = None
+            self.bias_hh = None
+
+    def init_state(self, inputs):
+        batch_size = inputs.shape[0]
+        return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype)
+
+    def forward(self, inputs, hx=None):
+        if hx is None:
+            hx = self.init_state(inputs)
+        pre_h = hx
+        i2h = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            i2h += self.bias_ih
+        h2h = np.matmul(pre_h, self.weight_hh.T)
+        if self.bias_hh is not None:
+            h2h += self.bias_hh
+        h = self.nonlinearity(i2h + h2h)
+        return h, h
+
+
+class GRUCell(LayerMixin):
+    def __init__(self, input_size, hidden_size, bias=True):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        self.parameters = dict()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = np.random.uniform(-std, std, (
+            3 * hidden_size, input_size)).astype('float64')
+        self.weight_hh = np.random.uniform(-std, std, (
+            3 * hidden_size, hidden_size)).astype('float64')
+        self.parameters['weight_ih'] = self.weight_ih
+        self.parameters['weight_hh'] = self.weight_hh
+        if bias:
+            self.bias_ih = np.random.uniform(-std, std, (
+                3 * hidden_size)).astype('float64')
+            self.bias_hh = np.random.uniform(-std, std, (
+                3 * hidden_size)).astype('float64')
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+        else:
+            self.bias_ih = None
+            self.bias_hh = None
+
+    def init_state(self, inputs):
+        batch_size = inputs.shape[0]
+        return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype)
+
+    def forward(self, inputs, hx=None):
+        if hx is None:
+            hx = self.init_state(inputs)
+        pre_hidden = hx
+        x_gates = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            x_gates = x_gates + self.bias_ih
+        h_gates = np.matmul(pre_hidden, self.weight_hh.T)
+        if self.bias_hh is not None:
+            h_gates = h_gates + self.bias_hh
+
+        x_r, x_z, x_c = np.split(x_gates, 3, 1)
+        h_r, h_z, h_c = np.split(h_gates, 3, 1)
+
+        r = 1.0 / (1.0 + np.exp(-(x_r + h_r)))
+        z = 1.0 / (1.0 + np.exp(-(x_z + h_z)))
+        c = np.tanh(x_c + r * h_c)  # apply reset gate after mm
+        h = (pre_hidden - c) * z + c
+        return h, h
+
+
+class LSTMCell(LayerMixin):
+    def __init__(self, input_size, hidden_size, bias=True):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        self.parameters = dict()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = np.random.uniform(-std, std, (
+            4 * hidden_size, input_size)).astype('float64')
+        self.weight_hh = np.random.uniform(-std, std, (
+            4 * hidden_size, hidden_size)).astype('float64')
+        self.parameters['weight_ih'] = self.weight_ih
+        self.parameters['weight_hh'] = self.weight_hh
+        if bias:
+            self.bias_ih = np.random.uniform(-std, std, (
+                4 * hidden_size)).astype('float64')
+            self.bias_hh = np.random.uniform(-std, std, (
+                4 * hidden_size)).astype('float64')
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+        else:
+            self.bias_ih = None
+            self.bias_hh = None
+
+    def init_state(self, inputs):
+        batch_size = inputs.shape[0]
+        init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype)
+        init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype)
+        return init_h, init_c
+
+    def forward(self, inputs, hx=None):
+        if hx is None:
+            hx = self.init_state(inputs)
+        pre_hidden, pre_cell = hx
+        gates = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            gates = gates + self.bias_ih
+        gates += np.matmul(pre_hidden, self.weight_hh.T)
+        if self.bias_hh is not None:
+            gates = gates + self.bias_hh
+
+        chunked_gates = np.split(gates, 4, -1)
+
+        i = 1.0 / (1.0 + np.exp(-chunked_gates[0]))
+        f = 1.0 / (1.0 + np.exp(-chunked_gates[1]))
+        o = 1.0 / (1.0 + np.exp(-chunked_gates[3]))
+        c = f * pre_cell + i * np.tanh(chunked_gates[2])
+        h = o * np.tanh(c)
+
+        return h, (h, c)
+
+
+def sequence_mask(lengths, max_len=None):
+    if max_len is None:
+        max_len = np.max(lengths)
+    else:
+        assert max_len >= np.max(lengths)
+    return np.arange(max_len) < np.expand_dims(lengths, -1)
+
+
+def update_state(mask, new, old):
+    if not isinstance(old, (tuple, list)):
+        return np.where(mask, new, old)
+    else:
+        return tuple(map(lambda x, y: np.where(mask, x, y), new, old))
+
+
+def rnn(cell,
+        inputs,
+        initial_states,
+        sequence_length=None,
+        time_major=False,
+        is_reverse=False):
+    if not time_major:
+        inputs = np.transpose(inputs, [1, 0, 2])
+    if is_reverse:
+        inputs = np.flip(inputs, 0)
+
+    if sequence_length is None:
+        mask = None
+    else:
+        mask = np.transpose(sequence_mask(sequence_length), [1, 0])
+        mask = np.expand_dims(mask, -1)
+        if is_reverse:
+            mask = np.flip(mask, 0)
+
+    time_steps = inputs.shape[0]
+    state = initial_states
+    outputs = []
+    for t in range(time_steps):
+        x_t = inputs[t]
+        if mask is not None:
+            m_t = mask[t]
+            y, new_state = cell(x_t, state)
+            y = np.where(m_t, y, 0.)
+            outputs.append(y)
+            state = update_state(m_t, new_state, state)
+        else:
+            y, new_state = cell(x_t, state)
+            outputs.append(y)
+            state = new_state
+
+    outputs = np.stack(outputs)
+    final_state = state
+
+    if is_reverse:
+        outputs = np.flip(outputs, 0)
+    if not time_major:
+        outputs = np.transpose(outputs, [1, 0, 2])
+    return outputs, final_state
+
+
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states,
+          sequence_length=None,
+          time_major=False):
+    states_fw, states_bw = initial_states
+    outputs_fw, states_fw = rnn(cell_fw,
+                                inputs,
+                                states_fw,
+                                sequence_length,
+                                time_major=time_major)
+
+    outputs_bw, states_bw = rnn(cell_bw,
+                                inputs,
+                                states_bw,
+                                sequence_length,
+                                time_major=time_major,
+                                is_reverse=True)
+
+    outputs = np.concatenate((outputs_fw, outputs_bw), -1)
+    final_states = (states_fw, states_bw)
+    return outputs, final_states
+
+
+def flatten(nested):
+    return list(_flatten(nested))
+
+
+def _flatten(nested):
+    for item in nested:
+        if isinstance(item, (list, tuple)):
+            for subitem in _flatten(item):
+                yield subitem
+        else:
+            yield item
+
+
+def unstack(array, axis=0):
+    num = array.shape[axis]
+    sub_arrays = np.split(array, num, axis)
+    return [np.squeeze(sub_array, axis) for sub_array in sub_arrays]
+
+
+def dropout(array, p=0.5):
+    if p == 0.0:
+        return array
+
+    mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype)
+    return array * (mask / (1 - p))
+
+
+def split_states(states, bidirectional=False, state_components=1):
+    if state_components == 1:
+        states = unstack(states)
+        if not bidirectional:
+            return states
+        else:
+            return list(zip(states[::2], states[1::2]))
+    else:
+        assert len(states) == state_components
+        states = tuple([unstack(item) for item in states])
+        if not bidirectional:
+            return list(zip(*states))
+        else:
+            states = list(zip(*states))
+            return list(zip(states[::2], states[1::2]))
+
+
+def concat_states(states, bidirectional=False, state_components=1):
+    if state_components == 1:
+        return np.stack(flatten(states))
+    else:
+        states = flatten(states)
+        componnets = []
+        for i in range(state_components):
+            componnets.append(states[i::state_components])
+        return [np.stack(item) for item in componnets]
+
+
+class RNN(LayerMixin):
+    def __init__(self, cell, is_reverse=False, time_major=False):
+        super(RNN, self).__init__()
+        self.cell = cell
+        if not hasattr(self.cell, "call"):
+            # for non-dygraph mode, `rnn` api uses cell.call
+            self.cell.call = self.cell.forward
+        self.is_reverse = is_reverse
+        self.time_major = time_major
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        final_outputs, final_states = rnn(self.cell,
+                                          inputs,
+                                          initial_states=initial_states,
+                                          sequence_length=sequence_length,
+                                          time_major=self.time_major,
+                                          is_reverse=self.is_reverse)
+        return final_outputs, final_states
+
+
+class BiRNN(LayerMixin):
+    def __init__(self, cell_fw, cell_bw, time_major=False):
+        super(BiRNN, self).__init__()
+        self.cell_fw = cell_fw
+        self.cell_bw = cell_bw
+        self.time_major = time_major
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        if isinstance(initial_states, (list, tuple)):
+            assert len(initial_states) == 2, \
+                "length of initial_states should be 2 when it is a list/tuple"
+        else:
+            initial_states = [initial_states, initial_states]
+
+        outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs,
+                                      initial_states, sequence_length,
+                                      self.time_major)
+        return outputs, final_states
+
+
+class RNNMixin(LayerListMixin):
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        batch_index = 1 if self.time_major else 0
+        batch_size = inputs.shape[batch_index]
+        dtype = inputs.dtype
+        if initial_states is None:
+            state_shape = (self.num_layers * self.num_directions, batch_size,
+                           self.hidden_size)
+            if self.state_components == 1:
+                initial_states = np.zeros(state_shape, dtype)
+            else:
+                initial_states = tuple([
+                    np.zeros(state_shape, dtype)
+                    for _ in range(self.state_components)
+                ])
+
+        states = split_states(initial_states, self.num_directions == 2,
+                              self.state_components)
+        final_states = []
+
+        for i, rnn_layer in enumerate(self):
+            if i > 0:
+                inputs = dropout(inputs, self.dropout)
+            outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
+            final_states.append(final_state)
+            inputs = outputs
+
+        final_states = concat_states(final_states, self.num_directions == 2,
+                                     self.state_components)
+        return outputs, final_states
+
+
+class SimpleRNN(RNNMixin):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 nonlinearity="tanh",
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(SimpleRNN, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = SimpleRNNCell(input_size, hidden_size, nonlinearity)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity)
+            cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size,
+                                        nonlinearity)
+                cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size,
+                                        nonlinearity)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
+
+
+class LSTM(RNNMixin):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(LSTM, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = LSTMCell(input_size, hidden_size)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = LSTMCell(hidden_size, hidden_size)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = LSTMCell(input_size, hidden_size)
+            cell_bw = LSTMCell(input_size, hidden_size)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = LSTMCell(2 * hidden_size, hidden_size)
+                cell_bw = LSTMCell(2 * hidden_size, hidden_size)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 2
+
+
+class GRU(RNNMixin):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(GRU, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = GRUCell(input_size, hidden_size)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = GRUCell(hidden_size, hidden_size)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = GRUCell(input_size, hidden_size)
+            cell_bw = GRUCell(input_size, hidden_size)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = GRUCell(2 * hidden_size, hidden_size)
+                cell_bw = GRUCell(2 * hidden_size, hidden_size)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d2677229a03f7bdac14a93e176747ba0a5f1d6b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.framework.set_default_dtype("float64")
+
+import numpy as np
+import unittest
+
+from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell
+from convert import convert_params_for_cell
+
+
+class TestSimpleRNNCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestSimpleRNNCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        rnn1 = SimpleRNNCell(16, 32, bias=self.bias)
+        rnn2 = paddle.nn.SimpleRNNCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        convert_params_for_cell(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+        y2, h2 = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestGRUCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestGRUCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        rnn1 = GRUCell(16, 32, bias=self.bias)
+        rnn2 = paddle.nn.GRUCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        convert_params_for_cell(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+        y2, h2 = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestLSTMCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestLSTMCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = LSTMCell(16, 32, bias=self.bias)
+        rnn2 = paddle.nn.LSTMCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        convert_params_for_cell(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        prev_c = np.random.randn(4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
+        y2, (h2, c2) = rnn2(
+            paddle.to_variable(x),
+            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(4, 16)
+
+        y1, (h1, c1) = rnn1(x)
+        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+def load_tests(loader, tests, pattern):
+    suite = unittest.TestSuite()
+    devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \
+        else ["cpu"]
+    for bias in [True, False]:
+        for device in devices:
+            for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]:
+                suite.addTest(test_class(bias, device))
+    return suite
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..948e47d5b99462c363015936f84058e222d548e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.framework.set_default_dtype("float64")
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_cell_static
+from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell
+
+
+class TestSimpleRNNCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestSimpleRNNCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = SimpleRNNCell(16, 32, bias=self.bias)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.SimpleRNNCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_cell_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp,
+                             feed=feed_dict,
+                             fetch_list=[y, h],
+                             use_prune=True)
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestGRUCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestGRUCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = GRUCell(16, 32, bias=self.bias)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.GRUCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_cell_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp,
+                             feed=feed_dict,
+                             fetch_list=[y, h],
+                             use_prune=True)
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestLSTMCell(unittest.TestCase):
+    def __init__(self, bias=True, place="cpu"):
+        super(TestLSTMCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = LSTMCell(16, 32, bias=self.bias)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTMCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_cell_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        prev_c = np.random.randn(4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                init_c = paddle.data(
+                    "init_c", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data, (init_h, init_c))
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c}
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, (h1, c1) = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp,
+                                 feed=feed_dict,
+                                 fetch_list=[y, h, c],
+                                 use_prune=True)
+
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+def load_tests(loader, tests, pattern):
+    suite = unittest.TestSuite()
+    devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \
+        else ["cpu"]
+    for bias in [True, False]:
+        for device in devices:
+            for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]:
+                suite.addTest(test_class(bias, device))
+    return suite
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef297b3bb62497073fd667238cae8a83daaa4967
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.set_default_dtype("float64")
+from paddle.fluid.layers import sequence_mask
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_net
+from rnn_numpy import SimpleRNN, LSTM, GRU
+
+
+class TestSimpleRNN(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestSimpleRNN, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        rnn1 = SimpleRNN(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn2 = paddle.nn.SimpleRNN(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        convert_params_for_net(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+        y2, h2 = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2 = paddle.multiply(y2, mask, axis=0)
+
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestGRU, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        rnn1 = GRU(16,
+                   32,
+                   2,
+                   time_major=self.time_major,
+                   direction=self.direction)
+        rnn2 = paddle.nn.GRU(16,
+                             32,
+                             2,
+                             time_major=self.time_major,
+                             direction=self.direction)
+        convert_params_for_net(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+        y2, h2 = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2 = paddle.multiply(y2, mask, axis=0)
+
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestLSTM(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestLSTM, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        rnn1 = LSTM(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn2 = paddle.nn.LSTM(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        convert_params_for_net(rnn1, rnn2)
+
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+    def test_with_initial_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+        prev_c = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
+        y2, (h2, c2) = rnn2(
+            paddle.to_variable(x),
+            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, (h1, c1) = rnn1(x)
+        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2 = paddle.multiply(y2, mask, axis=0)
+
+        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+def load_tests(loader, tests, pattern):
+    suite = unittest.TestSuite()
+    devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \
+        else ["cpu"]
+    for direction in ["forward", "backward", "bidirectional"]:
+        for time_major in [True, False]:
+            for device in devices:
+                for test_class in [TestSimpleRNN, TestLSTM, TestGRU]:
+                    suite.addTest(test_class(time_major, direction, device))
+    return suite
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..90ed6b8b4c9075f5a3e3925bb80e24c81a37869c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
@@ -0,0 +1,470 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.set_default_dtype("float64")
+from paddle.fluid.layers import sequence_mask
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_net_static
+from rnn_numpy import SimpleRNN, LSTM, GRU
+
+
+class TestSimpleRNN(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestSimpleRNN, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = SimpleRNN(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.SimpleRNN(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone().clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestGRU, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = GRU(16,
+                   32,
+                   2,
+                   time_major=self.time_major,
+                   direction=self.direction)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.GRU(16,
+                                     32,
+                                     2,
+                                     time_major=self.time_major,
+                                     direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestLSTM(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestLSTM, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = LSTM(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTM(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+        prev_c = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                init_c = paddle.data(
+                    "init_c", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data, (init_h, init_c))
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c}
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, (h1, c1) = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, (h, c) = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+def load_tests(loader, tests, pattern):
+    suite = unittest.TestSuite()
+    devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \
+        else ["cpu"]
+    for direction in ["forward", "backward", "bidirectional"]:
+        for time_major in [True, False]:
+            for device in devices:
+                for test_class in [TestSimpleRNN, TestLSTM, TestGRU]:
+                    suite.addTest(test_class(time_major, direction, device))
+    return suite
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index 17e0cd0d5b18652f828af9936b07cb4122f87b97..45d39afc115d292fd79a3bbc4f609ad080f74602 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -36,7 +36,7 @@ remove_dropout = False
 # and Executor is different.
 remove_bn = False
 
-remove_cudnn_conv = False
+remove_cudnn_conv = True
 
 remove_dropout = True
 remove_bn = True
@@ -179,7 +179,7 @@ def batch_size(use_cuda):
 def iter(use_cuda):
     if use_cuda:
         return 10
-    return 2
+    return 1
 
 
 gpu_img, gpu_label = init_data(
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
old mode 100644
new mode 100755
index 124767a3364b078ea2c74795c03497f3dc24ba8c..ab61a5b3cfccb0e885debe9786ae91a9754e9345
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -22,7 +22,7 @@ from scipy.special import expit, erf
 import paddle
 import paddle.fluid as fluid
 import paddle.nn as nn
-import paddle.nn.functional as functional
+import paddle.nn.functional as F
 from paddle.fluid import compiler, Program, program_guard
 
 
@@ -118,7 +118,7 @@ class TestLogSigmoid(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = np.log(1 / (1 + np.exp(-x)))
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -127,6 +127,48 @@ class TestLogSigmoid(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
+class TestLogSigmoidAPI(unittest.TestCase):
+    # test paddle.nn.LogSigmoid, paddle.nn.functional.logsigmoid
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [11, 17])
+            out1 = F.logsigmoid(x)
+            m = paddle.nn.LogSigmoid()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.logsigmoid(x)
+        m = paddle.nn.LogSigmoid()
+        out2 = m(x)
+        out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.logsigmoid, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32')
+            self.assertRaises(TypeError, F.logsigmoid, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16')
+            F.logsigmoid(x_fp16)
+
+
 class TestTanh(TestActivation, TestParameter):
     def setUp(self):
         self.op_type = "tanh"
@@ -149,6 +191,59 @@ class TestTanh(TestActivation, TestParameter):
         self.dtype = np.float32
 
 
+class TestTanhAPI(unittest.TestCase):
+    # test paddle.tanh, paddle.nn.tanh, paddle.nn.functional.tanh
+    def setUp(self):
+        self.dtype = 'float32'
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12], self.dtype)
+            out1 = F.tanh(x)
+            th = paddle.nn.Tanh()
+            out2 = th(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.tanh(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.tanh(x)
+        out2 = paddle.tanh(x)
+        th = paddle.nn.Tanh()
+        out3 = th(x)
+        out_ref = np.tanh(self.x_np)
+        for r in [out1, out2, out3]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12], self.dtype)
+            out = fluid.layers.tanh(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = np.tanh(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.tanh, 1)
+            # The input dtype must be float16, float32.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.tanh, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.tanh(x_fp16)
+
+
 class TestAtan(TestActivation, TestParameter):
     def setUp(self):
         self.op_type = "atan"
@@ -327,15 +422,20 @@ class TestCoshOpError(unittest.TestCase):
             fluid.layers.cosh(x_fp16)
 
 
-class TestTanhShrink(TestActivation):
+def ref_tanhshrink(x):
+    out = x - np.tanh(x)
+    return out
+
+
+class TestTanhshrink(TestActivation):
     def setUp(self):
         self.op_type = "tanh_shrink"
         self.init_dtype()
 
-        x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
-        out = x - np.tanh(x)
+        x = np.random.uniform(10, 20, [10, 17]).astype(self.dtype)
+        out = ref_tanhshrink(x)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -344,52 +444,224 @@ class TestTanhShrink(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestTanhshrinkAPI(unittest.TestCase):
+    # test paddle.nn.Tanhshrink, paddle.nn.functional.tanhshrink
+    def setUp(self):
+        self.x_np = np.random.uniform(10, 20, [10, 17]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.tanhshrink(x)
+            tanhshrink = paddle.nn.Tanhshrink()
+            out2 = tanhshrink(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_tanhshrink(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.tanhshrink(x)
+        tanhshrink = paddle.nn.Tanhshrink()
+        out2 = tanhshrink(x)
+        out_ref = ref_tanhshrink(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.tanh_shrink(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_tanhshrink(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.tanhshrink, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.tanhshrink, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.tanhshrink(x_fp16)
+
+
+def ref_hardshrink(x, threshold):
+    out = np.copy(x)
+    out[(out >= -threshold) & (out <= threshold)] = 0
+    return out
+
+
 class TestHardShrink(TestActivation):
     def setUp(self):
         self.op_type = "hard_shrink"
         self.init_dtype()
 
-        threshold = 0.5
+        self.threshold = 0.5
+        self.set_attrs()
         x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10
-        out = np.copy(x)
-        out[(out >= -threshold) & (out <= threshold)] = 0
+        out = ref_hardshrink(x, self.threshold)
 
-        self.attrs = {'lambda': threshold}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'threshold': self.threshold}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
+    def set_attrs(self):
+        pass
+
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
         self.check_grad(['X'], 'Out')
 
 
-class TestHardShrinkOpError(unittest.TestCase):
+class TestHardShrink_threshold_negative(TestHardShrink):
+    def set_attrs(self):
+        self.threshold = -0.1
+
+
+class TestHardShrinkAPI(unittest.TestCase):
+    # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.hardshrink(x)
+            hd = paddle.nn.Hardshrink()
+            out2 = hd(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.hardshrink(x)
+        hd = paddle.nn.Hardshrink()
+        out2 = hd(x)
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.hardshrink(x, 0.6)
+        hd = paddle.nn.Hardshrink(0.6)
+        out2 = hd(x)
+        out_ref = ref_hardshrink(self.x_np, 0.6)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12])
+            out = fluid.layers.hard_shrink(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.hard_shrink, 1)
+            self.assertRaises(TypeError, F.hardshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.hard_shrink, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardshrink, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.hard_shrink(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardshrink(x_fp16)
+
+
+def ref_hardtanh(x, min=-1.0, max=1.0):
+    out = np.copy(x)
+    out[np.abs(x - min) < 0.005] = min + 0.02
+    out[np.abs(x - max) < 0.005] = max + 0.02
+    out = np.minimum(np.maximum(x, min), max)
+    return out
+
+
+class TestHardtanhAPI(unittest.TestCase):
+    # test paddle.nn.Hardtanh, paddle.nn.functional.hardtanh
+    def setUp(self):
+        self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.hardtanh(x)
+            m = paddle.nn.Hardtanh()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardtanh(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.hardtanh(x)
+        m = paddle.nn.Hardtanh()
+        out2 = m(x)
+        out_ref = ref_hardtanh(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.hardtanh(x, -2.0, 2.0)
+        m = paddle.nn.Hardtanh(-2.0, 2.0)
+        out2 = m(x)
+        out_ref = ref_hardtanh(self.x_np, -2.0, 2.0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.hardtanh, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardtanh, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardtanh(x_fp16)
 
 
-class TestSoftShrink(TestActivation):
+def ref_softshrink(x, threshold=0.5):
+    out = np.copy(x)
+    out = (out < -threshold) * (out + threshold) + (out > threshold) * (
+        out - threshold)
+    return out
+
+
+class TestSoftshrink(TestActivation):
     def setUp(self):
         self.op_type = "softshrink"
         self.init_dtype()
 
-        lambda_val = 0.1
-        x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
-        out = np.copy(x)
-        out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
-            out - lambda_val)
+        threshold = 0.8
 
-        self.attrs = {'lambda': lambda_val}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
+        out = ref_softshrink(x, threshold)
+        self.inputs = {'X': x}
+        self.attrs = {"lambda": threshold}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -398,17 +670,59 @@ class TestSoftShrink(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestSoftShrinkOpError(unittest.TestCase):
+class TestSoftshrinkAPI(unittest.TestCase):
+    # test paddle.nn.Softshrink, paddle.nn.functional.softshrink
+    def setUp(self):
+        self.threshold = 0.8
+        self.x_np = np.random.uniform(0.25, 10, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softshrink(x, self.threshold)
+            softshrink = paddle.nn.Softshrink(self.threshold)
+            out2 = softshrink(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softshrink(x, self.threshold)
+        softshrink = paddle.nn.Softshrink(self.threshold)
+        out2 = softshrink(x)
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softshrink(x, self.threshold)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.softshrink, 1)
+            self.assertRaises(TypeError, F.softshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.softshrink, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softshrink, x_int32)
+            # The threshold must be no less than zero
+            x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32')
+            self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.softshrink(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softshrink(x_fp16)
 
 
 class TestSqrt(TestActivation, TestParameter):
@@ -594,7 +908,7 @@ class TestRelu(TestActivation):
         x[np.abs(x) < 0.005] = 0.02
         out = np.maximum(x, 0)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -603,32 +917,72 @@ class TestRelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestReluOpError(unittest.TestCase):
+class TestReluAPI(unittest.TestCase):
+    # test paddle.nn.ReLU, paddle.nn.functional.relu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.relu(x)
+            m = paddle.nn.ReLU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.maximum(self.x_np, 0)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.relu(x)
+        m = paddle.nn.ReLU()
+        out2 = m(x)
+        out_ref = np.maximum(self.x_np, 0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.relu, 1)
+            self.assertRaises(TypeError, F.relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.relu, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32')
+            self.assertRaises(TypeError, F.relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.relu(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16')
+            F.relu(x_fp16)
+
+
+def ref_leaky_relu(x, alpha=0.01):
+    out = np.copy(x)
+    out[out < 0] *= alpha
+    return out
 
 
 class TestLeakyRelu(TestActivation):
+    def get_alpha(self):
+        return 0.02
+
     def setUp(self):
         self.op_type = "leaky_relu"
         self.init_dtype()
+        alpha = self.get_alpha()
 
+        np.random.seed(10)
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0.02 * x)
+        x[np.abs(x) < 0.005] = 0.05
+        out = ref_leaky_relu(x, alpha)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
+        self.attrs = {'alpha': alpha}
 
     def test_check_grad(self):
         if self.dtype == np.float16:
@@ -636,18 +990,78 @@ class TestLeakyRelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestLeakyReluOpError(unittest.TestCase):
+class TestLeakyReluAlpha1(TestLeakyRelu):
+    def get_alpha(self):
+        return 2
+
+
+class TestLeakyReluAlpha2(TestLeakyRelu):
+    def get_alpha(self):
+        return -0.01
+
+
+class TestLeakyReluAlpha3(TestLeakyRelu):
+    def get_alpha(self):
+        return -2.0
+
+
+class TestLeakyReluAPI(unittest.TestCase):
+    # test paddle.nn.LeakyReLU, paddle.nn.functional.leaky_relu,
+    # fluid.layers.leaky_relu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.leaky_relu(x)
+            m = paddle.nn.LeakyReLU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_leaky_relu(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.leaky_relu(x)
+        m = paddle.nn.LeakyReLU()
+        out2 = m(x)
+        out_ref = ref_leaky_relu(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.leaky_relu(x, 0.6)
+        m = paddle.nn.LeakyReLU(0.6)
+        out2 = m(x)
+        out_ref = ref_leaky_relu(self.x_np, 0.6)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12])
+            out = fluid.layers.leaky_relu(x, 0.01)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_leaky_relu(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.leaky_relu, 1)
+            self.assertRaises(TypeError, F.leaky_relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.leaky_relu, x_int32)
-            # support the input dtype is float32
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float32')
-            fluid.layers.leaky_relu(x_fp16)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.leaky_relu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.leaky_relu(x_fp16)
 
 
 def gelu(x, approximate):
@@ -667,7 +1081,7 @@ class TestGeluApproximate(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = gelu(x, approximate)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
         self.attrs = {"approximate": approximate}
 
@@ -685,7 +1099,7 @@ class TestGelu(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = gelu(x, approximate)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
         self.attrs = {"approximate": approximate}
 
@@ -695,6 +1109,55 @@ class TestGelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestGELUAPI(unittest.TestCase):
+    # test paddle.nn.GELU, paddle.nn.functional.gelu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [11, 17])
+            out1 = F.gelu(x)
+            m = paddle.nn.GELU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = gelu(self.x_np, False)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.gelu(x)
+        m = paddle.nn.GELU()
+        out2 = m(x)
+        out_ref = gelu(self.x_np, False)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.gelu(x, True)
+        m = paddle.nn.GELU(True)
+        out2 = m(x)
+        out_ref = gelu(self.x_np, True)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.gelu, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32')
+            self.assertRaises(TypeError, F.gelu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16')
+            F.gelu(x_fp16)
+
+
 class TestBRelu(TestActivation):
     def setUp(self):
         self.op_type = "brelu"
@@ -734,20 +1197,24 @@ class TestBReluOpError(unittest.TestCase):
             fluid.layers.brelu(x_fp16)
 
 
+def ref_relu6(x, threshold=6.0):
+    out = np.copy(x)
+    out[np.abs(x - threshold) < 0.005] = threshold + 0.02
+    out = np.minimum(np.maximum(x, 0), threshold)
+    return out
+
+
 class TestRelu6(TestActivation):
     def setUp(self):
         self.op_type = "relu6"
         self.init_dtype()
 
         x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype)
-        threshold = 6.0
-        # The same with TestAbs
         x[np.abs(x) < 0.005] = 0.02
-        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
-        out = np.minimum(np.maximum(x, 0), threshold)
+        out = ref_relu6(x)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold}
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': 6.0}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -756,17 +1223,56 @@ class TestRelu6(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestRelu6OpError(unittest.TestCase):
+class TestRelu6API(unittest.TestCase):
+    # test paddle.nn.ReLU6, paddle.nn.functional.relu6
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 10, [10, 12]).astype(np.float64)
+        self.x_np[np.abs(self.x_np) < 0.005] = 0.02
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.relu6(x)
+            relu6 = paddle.nn.ReLU6()
+            out2 = relu6(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_relu6(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.relu6(x)
+        relu6 = paddle.nn.ReLU6()
+        out2 = relu6(x)
+        out_ref = ref_relu6(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.relu6(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_relu6(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.relu6, 1)
+            self.assertRaises(TypeError, F.relu6, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.relu6, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.relu6, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.relu6(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.relu6(x_fp16)
 
 
 class TestHardSwish(TestActivation):
@@ -844,6 +1350,11 @@ class TestSoftReluOpError(unittest.TestCase):
             fluid.layers.soft_relu(x_fp16)
 
 
+def elu(x, alpha):
+    out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+    return out_ref.astype(x.dtype)
+
+
 class TestELU(TestActivation):
     def setUp(self):
         self.op_type = "elu"
@@ -851,7 +1362,7 @@ class TestELU(TestActivation):
 
         x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype)
         alpha = 1.
-        out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+        out = elu(x, alpha)
         # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
         # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
         self.inputs = {'X': x}
@@ -864,16 +1375,53 @@ class TestELU(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestELUOpError(unittest.TestCase):
+class TestELUAPI(unittest.TestCase):
+    # test paddle.nn.ELU, paddle.nn.functional.elu
+    def setUp(self):
+        self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.elu(x)
+            m = paddle.nn.ELU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = elu(self.x_np, 1.0)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.elu(x)
+        m = paddle.nn.ELU()
+        out2 = m(x)
+        out_ref = elu(self.x_np, 1.0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.elu(x, 0.2)
+        m = paddle.nn.ELU(0.2)
+        out2 = m(x)
+        out_ref = elu(self.x_np, 0.2)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
     def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of elu_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.elu, x1)
-            # The input dtype of elu_op must be float16 float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.elu, x2)
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.elu, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32')
+            self.assertRaises(TypeError, F.elu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16')
+            F.elu(x_fp16)
 
 
 class TestReciprocal(TestActivation):
@@ -1107,16 +1655,25 @@ class TestSTanhOpError(unittest.TestCase):
             fluid.layers.stanh(x_fp16)
 
 
+def ref_softplus(x, beta=1, threshold=20):
+    x_beta = beta * x
+    out = np.select([x_beta <= threshold, x_beta > threshold],
+                    [np.log(1 + np.exp(x_beta)) / beta, x])
+    return out
+
+
 class TestSoftplus(TestActivation):
     def setUp(self):
         self.op_type = "softplus"
         self.init_dtype()
-        self.dtype = np.float64
 
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.log(1 + np.exp(x))
+        beta = 2
+        threshold = 15
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        out = ref_softplus(x, beta, threshold)
+        self.inputs = {'X': x}
+        self.attrs = {'beta': beta, "threshold": threshold}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -1125,15 +1682,72 @@ class TestSoftplus(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestSoftplusAPI(unittest.TestCase):
+    # test paddle.nn.Softplus, paddle.nn.functional.softplus
+    def setUp(self):
+        self.beta = 2
+        self.threshold = 15
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softplus(x, self.beta, self.threshold)
+            softplus = paddle.nn.Softplus(self.beta, self.threshold)
+            out2 = softplus(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softplus(self.x_np, self.beta, self.threshold)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softplus(x, self.beta, self.threshold)
+        softplus = paddle.nn.Softplus(self.beta, self.threshold)
+        out2 = softplus(x)
+        out_ref = ref_softplus(self.x_np, self.beta, self.threshold)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softplus(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softplus(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softplus, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softplus, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softplus(x_fp16)
+
+
+def ref_softsign(x):
+    out = np.divide(x, 1 + np.abs(x))
+    return out
+
+
 class TestSoftsign(TestActivation):
     def setUp(self):
         self.op_type = "softsign"
         self.init_dtype()
 
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.divide(x, 1 + np.abs(x))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        out = ref_softsign(x)
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -1142,6 +1756,57 @@ class TestSoftsign(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestSoftsignAPI(unittest.TestCase):
+    # test paddle.nn.Softsign, paddle.nn.functional.softsign
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softsign(x)
+            softsign = paddle.nn.Softsign()
+            out2 = softsign(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softsign(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softsign(x)
+        softsign = paddle.nn.Softsign()
+        out2 = softsign(x)
+        out_ref = ref_softsign(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softsign(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softsign(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softsign, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softsign, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softsign(x_fp16)
+
+
 class TestThresholdedRelu(TestActivation):
     def setUp(self):
         self.op_type = "thresholded_relu"
@@ -1337,9 +2002,9 @@ create_test_act_fp16_class(TestActivation)
 create_test_act_fp16_class(TestSigmoid)
 create_test_act_fp16_class(TestLogSigmoid)
 create_test_act_fp16_class(TestTanh)
-create_test_act_fp16_class(TestTanhShrink)
+create_test_act_fp16_class(TestTanhshrink)
 create_test_act_fp16_class(TestHardShrink)
-create_test_act_fp16_class(TestSoftShrink)
+create_test_act_fp16_class(TestSoftshrink)
 create_test_act_fp16_class(TestSqrt)
 create_test_act_fp16_class(TestAbs)
 create_test_act_fp16_class(TestCeil, grad_check=False)
@@ -1372,140 +2037,5 @@ create_test_act_fp16_class(TestHardSigmoid)
 create_test_act_fp16_class(TestSwish)
 create_test_act_fp16_class(TestHardSwish)
 
-
-class TestNNReluAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 12]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return np.maximum(x, 0)
-
-    def ref_backward(self, y, dy):
-        y_t = y.copy()
-        y_t[y_t > 0] = 1
-        return y_t * dy
-
-    def check_api(self, place=fluid.CPUPlace(), inplace=False):
-        main_program = Program()
-        myrelu = nn.ReLU(inplace)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            x.stop_gradient = False
-            y = myrelu(x)
-            fluid.backward.append_backward(fluid.layers.mean(y))
-        exe = fluid.Executor(place)
-        out = exe.run(main_program,
-                      feed={'x': self.x},
-                      fetch_list=[y, y.grad_name, x.grad_name])
-        self.assertTrue(np.allclose(out[0], self.y))
-        self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
-
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = myrelu(x)
-        self.assertTrue(np.allclose(y.numpy(), self.y))
-
-    def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for inplace in [True, False]:
-                self.check_api(place, inplace)
-
-
-class TestNNFunctionalReluAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 12]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return np.maximum(x, 0)
-
-    def test_check_api(self):
-        main_program = Program()
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.relu(x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
-        self.assertTrue(np.allclose(out[0], self.y))
-
-
-class TestNNSigmoidAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 15]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return 1 / (1 + np.exp(-x))
-
-    def ref_backward(self, y, dy):
-        return dy * y * (1 - y)
-
-    def check_api(self, place=fluid.CPUPlace(), inplace=False):
-        main_program = Program()
-        mysigmoid = nn.Sigmoid(inplace)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            x.stop_gradient = False
-            y = mysigmoid(x)
-            fluid.backward.append_backward(fluid.layers.mean(y))
-        exe = fluid.Executor(place)
-        out = exe.run(main_program,
-                      feed={'x': self.x},
-                      fetch_list=[y, y.grad_name, x.grad_name])
-        self.assertTrue(np.allclose(out[0], self.y))
-        self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
-
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = mysigmoid(x)
-        self.assertTrue(np.allclose(y.numpy(), self.y))
-
-    def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for inplace in [True, False]:
-                self.check_api(place, inplace)
-
-
-class TestNNFunctionalSigmoidAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 15]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return 1 / (1 + np.exp(-x))
-
-    def test_check_api(self):
-        main_program = Program()
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.sigmoid(x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
-        self.assertTrue(np.allclose(out[0], self.y))
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 7a7099b7113c8233fb94074519386f9e4270a019..d4aafcd27a5aceb3c0b5fa9ddf8343d404bddbf5 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -20,6 +20,7 @@ from op_test import OpTest
 from paddle.fluid import core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
+import paddle
 
 
 class TestAdamOp1(OpTest):
@@ -401,46 +402,107 @@ class TestAdamOpBetaVariable(OpTest):
         self.check_output()
 
 
-class TestAdamOptimizerBetaVariable(unittest.TestCase):
-    def test_adam_optimizer(self):
-        def test_with_place(place, shape):
-            exe = fluid.Executor(place)
-
-            train_prog = fluid.Program()
-            startup = fluid.Program()
-            with fluid.program_guard(train_prog, startup):
-                with fluid.unique_name.guard():
-                    data = fluid.data(name="data", shape=shape)
-                    conv = fluid.layers.conv2d(data, 8, 3)
-                    loss = fluid.layers.reduce_mean(conv)
-
-                    beta1 = fluid.layers.create_global_var(
-                        shape=[1],
-                        value=0.85,
-                        dtype='float32',
-                        persistable=True)
-                    beta2 = fluid.layers.create_global_var(
-                        shape=[1],
-                        value=0.95,
-                        dtype='float32',
-                        persistable=True)
-                    opt = fluid.optimizer.Adam(
-                        learning_rate=1e-5, beta1=beta1, beta2=beta2)
-                    opt.minimize(loss)
-
-            exe.run(startup)
-            data_np = np.random.random(shape).astype('float32')
-            rets = exe.run(train_prog,
-                           feed={"data": data_np},
-                           fetch_list=[loss])
-            assert rets[0] is not None
-
+class TestAdamOpV2(unittest.TestCase):
+    def test_adam_op(self):
+        place = fluid.CPUPlace()
         shape = [2, 3, 8, 8]
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            test_with_place(place, shape)
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = fluid.layers.reduce_mean(conv)
+
+                beta1 = fluid.layers.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True)
+                beta2 = fluid.layers.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.Adam(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+    def test_adam_op_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = fluid.dygraph.to_variable(value)
+        linear = fluid.Linear(13, 5, dtype="float32")
+
+        adam = paddle.optimizer.Adam(
+            learning_rate=0.01, parameters=linear.parameters())
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adam_op_with_state_dict(self):
+
+        import paddle
+        paddle.disable_static()
+        emb = paddle.nn.Embedding([10, 10])
+
+        adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+        state_dict = adam.state_dict()
+        adam.set_state_dict(state_dict)
+
+        #learning_rate is _LRScheduler
+        learning_rate = paddle.optimizer.CosineAnnealingLR(
+            learning_rate=0.1, T_max=10)
+        adam = paddle.optimizer.Adam(
+            learning_rate=learning_rate,
+            weight_decay=fluid.regularizer.L2Decay(0.001),
+            parameters=emb.parameters())
+        lr = adam.get_lr()
+        state_dict = adam.state_dict()
+        adam.set_state_dict(state_dict)
+
+        #leanrning_rate is Tensor
+        with self.assertRaises(TypeError):
+            learning_rate = np.array([0.01]).astype("float32")
+            learning_rate = paddle.to_tensor(learning_rate)
+            adam = paddle.optimizer.Adam(
+                learning_rate=learning_rate, parameters=emb.parameters())
+
+        params = adam.get_opti_var_name_list()
+        assert (params is not None)
+
+    def test_adam_with_grad_clip(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = fluid.dygraph.to_variable(value)
+        linear = fluid.Linear(13, 5, dtype="float32")
+        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        adam = paddle.optimizer.Adam(
+            0.1, parameters=linear.parameters(), grad_clip=clip)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adam_op_with_set_lr(self):
+        paddle.disable_static()
+        linear = paddle.nn.Linear(10, 10)
+        adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+        lr = 0.01
+        adam.set_lr(lr)
+        cur_lr = adam.get_lr()
+        assert (lr == cur_lr)
+        with self.assertRaises(TypeError):
+            lr_var = paddle.create_global_var(
+                shape=[1], value=lr, dtype='float32')
+            adam.set_lr(lr_var)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a33e11d2862c037639b1643a2e44ff81a757053
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+
+class TestAdamaxAPI(unittest.TestCase):
+    def test_adamax_api_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5)
+        adam = paddle.optimizer.Adamax(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adamax_api(self):
+        place = fluid.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+                beta1 = 0.85
+                beta2 = 0.95
+                opt = paddle.optimizer.Adamax(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a7cf54e2e0f15e51ba1b6f7526837f53c7cc2e0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import numpy as np
+import paddle.fluid as fluid
+
+
+class TestAdamWOp(unittest.TestCase):
+    def test_adamw_op_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5)
+        adam = paddle.optimizer.AdamW(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            apply_decay_param_fun=lambda name: True,
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adamw_op_coverage(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5)
+        adam = paddle.optimizer.AdamW(
+            learning_rate=0.0,
+            parameters=linear.parameters(),
+            apply_decay_param_fun=lambda name: True,
+            weight_decay=0.01)
+        assert (adam.__str__() is not None)
+
+    def test_adamw_op(self):
+        place = fluid.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+
+                beta1 = fluid.layers.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True)
+                beta2 = fluid.layers.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.AdamW(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c30e3d2ade0725e6debcdd0a69ca4eee622aec
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -0,0 +1,274 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
+                            pool_type="avg"):
+
+    N = x.shape[0]
+    C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \
+        else [x.shape[3], x.shape[1], x.shape[2]]
+
+    if (isinstance(output_size, int) or output_size == None):
+        H_out = output_size
+        W_out = output_size
+        output_size = [H_out, W_out]
+    else:
+        H_out, W_out = output_size
+
+    if output_size[0] == None:
+        output_size[0] = H
+        H_out = H
+    if output_size[1] == None:
+        output_size[1] = W
+        W_out = W
+
+    out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \
+        else np.zeros((N, H_out, W_out, C))
+
+    for i in range(H_out):
+        in_h_start = adaptive_start_index(i, H, output_size[0])
+        in_h_end = adaptive_end_index(i, H, output_size[0])
+
+        for j in range(W_out):
+            in_w_start = adaptive_start_index(j, W, output_size[1])
+            in_w_end = adaptive_end_index(j, W, output_size[1])
+
+            if data_format == 'NCHW':
+                x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
+                elif pool_type == 'max':
+                    out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+            elif data_format == 'NHWC':
+                x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
+                elif pool_type == 'max':
+                    out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
+    return out
+
+
+class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool2d_forward(
+            x=self.x_np,
+            output_size=[3, 3],
+            pool_type="avg",
+            data_format="NHWC")
+
+        self.res_5_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[None, 3], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[2, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[None, 3])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[2, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[None, 3])
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool2d_forward(
+            x=self.x_np,
+            output_size=[3, 3],
+            pool_type="avg",
+            data_format="NHWC")
+
+        self.res_5_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[None, 3], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[3, 3], data_format="NHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_avg_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[3, 3], data_format="NHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_avg_pool(x=x)
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..c04ee660667edaff01d7029e83b912c05429a15f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -0,0 +1,293 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def adaptive_pool3d_forward(x,
+                            output_size,
+                            adaptive=True,
+                            data_format='NCDHW',
+                            pool_type='avg'):
+
+    N = x.shape[0]
+    C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \
+        if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]]
+
+    if (isinstance(output_size, int) or output_size == None):
+        H_out = output_size
+        W_out = output_size
+        D_out = output_size
+        output_size = [D_out, H_out, W_out]
+    else:
+        D_out, H_out, W_out = output_size
+
+    if output_size[0] == None:
+        output_size[0] = D
+        D_out = D
+    if output_size[1] == None:
+        output_size[1] = H
+        H_out = H
+    if output_size[2] == None:
+        output_size[2] = W
+        W_out = W
+
+    out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \
+        else np.zeros((N, D_out, H_out, W_out, C))
+    for k in range(D_out):
+        d_start = adaptive_start_index(k, D, output_size[0])
+        d_end = adaptive_end_index(k, D, output_size[0])
+
+        for i in range(H_out):
+            h_start = adaptive_start_index(i, H, output_size[1])
+            h_end = adaptive_end_index(i, H, output_size[1])
+
+            for j in range(W_out):
+                w_start = adaptive_start_index(j, W, output_size[2])
+                w_end = adaptive_end_index(j, W, output_size[2])
+
+                if data_format == 'NCDHW':
+                    x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
+                                 w_end]
+                    if pool_type == 'avg':
+                        field_size = (d_end - d_start) * (h_end - h_start) * (
+                            w_end - w_start)
+                        out[:, :, k, i, j] = np.sum(x_masked,
+                                                    axis=(2, 3, 4)) / field_size
+                    elif pool_type == 'max':
+                        out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+
+                elif data_format == 'NDHWC':
+                    x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
+                                 w_end, :]
+                    if pool_type == 'avg':
+                        field_size = (d_end - d_start) * (h_end - h_start) * (
+                            w_end - w_start)
+                        out[:, k, i, j, :] = np.sum(x_masked,
+                                                    axis=(1, 2, 3)) / field_size
+                    elif pool_type == 'max':
+                        out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3))
+    return out
+
+
+class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool3d_forward(
+            x=self.x_np,
+            output_size=[3, 3, 3],
+            pool_type="avg",
+            data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool3d_forward(
+            x=self.x_np,
+            output_size=[3, 3, 3],
+            pool_type="avg",
+            data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3], data_format="NDHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_avg_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3], data_format="NDHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_avg_pool(x=x)
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py
index 0bcdc45a2ccd0fd240c42b68a657557e50e4dc02..6e66c0c0029accdcdf81ae67dff1a49e3e8867d4 100644
--- a/python/paddle/fluid/tests/unittests/test_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py
@@ -240,13 +240,13 @@ class TestAddMMAPI(unittest.TestCase):
         data_y = np.ones((2, 2)).astype(np.float32)
         data_input = np.ones((2, 2)).astype(np.float32)
 
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         def test_error1():
             data_x_wrong = np.ones((2, 3)).astype(np.float32)
-            x = paddle.imperative.to_variable(data_x_wrong)
-            y = paddle.imperative.to_variable(data_y)
-            input = paddle.imperative.to_variable(data_input)
+            x = paddle.to_variable(data_x_wrong)
+            y = paddle.to_variable(data_y)
+            input = paddle.to_variable(data_input)
             out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
         self.assertRaises(ValueError, test_error1)
 '''
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
index c524fb6930d97c0eb2971d09e751a54628d41325..6157314b1f060577a7d058f0de9a42f6368947ff 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
@@ -63,7 +63,7 @@ class TestAffineChannelOp(OpTest):
         self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
 
     def init_test_case(self):
-        self.shape = [2, 100, 12, 12]
+        self.shape = [2, 100, 3, 3]
         self.C = 100
         self.layout = 'NCHW'
 
@@ -102,7 +102,7 @@ class TestAffineChannelOpError(unittest.TestCase):
 
 class TestAffineChannelNHWC(TestAffineChannelOp):
     def init_test_case(self):
-        self.shape = [2, 12, 12, 100]
+        self.shape = [2, 3, 3, 100]
         self.C = 100
         self.layout = 'NHWC'
 
@@ -115,7 +115,7 @@ class TestAffineChannelNHWC(TestAffineChannelOp):
 
 class TestAffineChannel2D(TestAffineChannelOp):
     def init_test_case(self):
-        self.shape = [8, 100]
+        self.shape = [2, 100]
         self.C = 100
         self.layout = 'NCHW'
 
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..c874cf197ea88c7f12b9b24223d40d22be268b10
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class AffineGridTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 theta_shape=(20, 2, 3),
+                 output_shape=[20, 2, 5, 7],
+                 align_corners=True,
+                 dtype="float32",
+                 invalid_theta=False,
+                 variable_output_shape=False):
+        super(AffineGridTestCase, self).__init__(methodName)
+
+        self.theta_shape = theta_shape
+        self.output_shape = output_shape
+        self.align_corners = align_corners
+        self.dtype = dtype
+        self.invalid_theta = invalid_theta
+        self.variable_output_shape = variable_output_shape
+
+    def setUp(self):
+        self.theta = np.random.randn(*(self.theta_shape)).astype(self.dtype)
+
+    def fluid_layer(self, place):
+        # align_corners = True
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                theta_var = fluid.data(
+                    "input", self.theta_shape, dtype=self.dtype)
+                y_var = fluid.layers.affine_grid(theta_var, self.output_shape)
+        feed_dict = {"input": self.theta}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                theta_var = fluid.data(
+                    "input", self.theta_shape, dtype=self.dtype)
+                y_var = F.affine_grid(
+                    theta_var,
+                    self.output_shape,
+                    align_corners=self.align_corners)
+        feed_dict = {"input": self.theta}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_dygraph_layer(self):
+        theta_var = dg.to_variable(
+            self.theta) if not self.invalid_theta else "invalid"
+        output_shape = dg.to_variable(
+            self.
+            output_shape) if self.variable_output_shape else self.output_shape
+        y_var = F.affine_grid(
+            theta_var, output_shape, align_corners=self.align_corners)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        place = fluid.CPUPlace()
+        result1 = self.fluid_layer(place)
+        result2 = self.functional(place)
+        with dg.guard(place):
+            result3 = self.paddle_dygraph_layer()
+        if self.align_corners:
+            np.testing.assert_array_almost_equal(result1, result2)
+        np.testing.assert_array_almost_equal(result2, result3)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class AffineGridErrorTestCase(AffineGridTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_dygraph_layer()
+
+
+def add_cases(suite):
+    suite.addTest(AffineGridTestCase(methodName='runTest'))
+    suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=True))
+
+    suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=False))
+    suite.addTest(
+        AffineGridTestCase(
+            methodName='runTest', variable_output_shape=True))
+
+    suite.addTest(
+        AffineGridTestCase(
+            methodName='runTest',
+            theta_shape=(20, 2, 3),
+            output_shape=[20, 1, 7, 7],
+            align_corners=True))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        AffineGridErrorTestCase(
+            methodName='runTest', output_shape="not_valid"))
+    suite.addTest(
+        AffineGridErrorTestCase(
+            methodName='runTest',
+            invalid_theta=True))  # to test theta not variable error checking
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
index 3668c4f4aa174e34dcc96d40ddae7b359c1bee18..55612d71a17a7ae9801535bf5a35c83b100aab30 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -17,14 +17,20 @@ import numpy as np
 from op_test import OpTest
 
 
-def AffineGrid(theta, size):
+def AffineGrid(theta, size, align_corners):
     n = size[0]
     w = size[3]
     h = size[2]
+    h_factor = w_factor = 1
+    if not align_corners:
+        h_factor = (h - 1) / float(h)
+        w_factor = (w - 1) / float(w)
     h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+        np.linspace(-1, 1, h)[np.newaxis, :], w,
+        axis=0).T[:, :, np.newaxis] * h_factor
     w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+        np.linspace(-1, 1, w)[np.newaxis, :], h,
+        axis=0)[:, :, np.newaxis] * w_factor
     grid = np.concatenate(
         [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
     grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
@@ -45,12 +51,17 @@ class TestAffineGridOp(OpTest):
         theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
         theta = np.ones(self.theta_shape).astype("float32")
         self.inputs = {'Theta': theta}
-        self.attrs = {"use_cudnn": True}
+        self.attrs = {
+            "use_cudnn": self.use_cudnn,
+            "align_corners": self.align_corners
+        }
         if self.dynamic_shape:
             self.inputs['OutputShape'] = self.output_shape
         else:
             self.attrs['output_shape'] = self.output_shape
-        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+        self.outputs = {
+            'Output': AffineGrid(theta, self.output_shape, self.align_corners)
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -62,6 +73,8 @@ class TestAffineGridOp(OpTest):
         self.theta_shape = (17, 2, 3)
         self.output_shape = np.array([17, 2, 5, 7]).astype("int32")
         self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = True
 
 
 class TestAffineGridOpCase1(TestAffineGridOp):
@@ -69,6 +82,35 @@ class TestAffineGridOpCase1(TestAffineGridOp):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
         self.dynamic_shape = True
+        self.use_cudnn = True
+        self.align_corners = True
+
+
+class TestAffineGridOpCase2(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = True
+
+
+class TestAffineGridOpCase3(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = False
+
+
+class TestAffineGridOpCase4(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (25, 2, 3)
+        self.output_shape = np.array([25, 2, 5, 6]).astype("int32")
+        self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_allclose_op.py b/python/paddle/fluid/tests/unittests/test_allclose_op.py
index 5b5ed2641880ade671434185414fa45c26901a2d..dc50e569f80433a5730b1ea33a6f3b4922d99c91 100644
--- a/python/paddle/fluid/tests/unittests/test_allclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_allclose_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 
 
 class TestAllcloseOp(OpTest):
@@ -76,5 +77,58 @@ class TestAllcloseOpNanTrue(TestAllcloseOp):
         self.equal_nan = True
 
 
+class TestAllcloseDygraph(unittest.TestCase):
+    def test_api_case(self):
+        paddle.disable_static()
+        x_data = np.random.rand(10, 10)
+        y_data = np.random.rand(10, 10)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.allclose(x, y, rtol=1e-05, atol=1e-08)
+        expected_out = np.allclose(x_data, y_data, rtol=1e-05, atol=1e-08)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        paddle.enable_static()
+
+
+class TestAllcloseError(unittest.TestCase):
+    def test_input_dtype(self):
+        def test_x_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float16')
+                y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_x_dtype)
+
+        def test_y_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                y = paddle.data(name='y', shape=[10, 10], dtype='int32')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_y_dtype)
+
+    def test_attr(self):
+        x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+        y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+
+        def test_rtol():
+            result = paddle.allclose(x, y, rtol=True)
+
+        self.assertRaises(TypeError, test_rtol)
+
+        def test_atol():
+            result = paddle.allclose(x, y, rtol=True)
+
+        self.assertRaises(TypeError, test_atol)
+
+        def test_equal_nan():
+            result = paddle.allclose(x, y, equal_nan=1)
+
+        self.assertRaises(TypeError, test_equal_nan)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py
index 1736e49f3b67b380b88e53ac9876f3ccde53104c..29003d28e441c02e040a8d6cb9888e376521bc72 100644
--- a/python/paddle/fluid/tests/unittests/test_arange.py
+++ b/python/paddle/fluid/tests/unittests/test_arange.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import paddle
 from paddle.fluid import core
-from paddle import program_guard, Program
+from paddle.static import program_guard, Program
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -82,7 +82,7 @@ class TestArangeAPI(unittest.TestCase):
 
             place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             out = exe.run(fetch_list=[x1])
 
         expected_data = np.arange(0, 5, 1).astype(np.float32)
@@ -93,15 +93,16 @@ class TestArangeImperative(unittest.TestCase):
     def test_out(self):
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        with paddle.imperative.guard(place):
-            x1 = paddle.arange(0, 5, 1)
-            x2 = paddle.tensor.arange(5)
-            x3 = paddle.tensor.creation.arange(5)
-
-            start = paddle.imperative.to_variable(np.array([0], 'float32'))
-            end = paddle.imperative.to_variable(np.array([5], 'float32'))
-            step = paddle.imperative.to_variable(np.array([1], 'float32'))
-            x4 = paddle.arange(start, end, step, 'int64')
+        paddle.disable_static(place)
+        x1 = paddle.arange(0, 5, 1)
+        x2 = paddle.tensor.arange(5)
+        x3 = paddle.tensor.creation.arange(5)
+
+        start = paddle.to_variable(np.array([0], 'float32'))
+        end = paddle.to_variable(np.array([5], 'float32'))
+        step = paddle.to_variable(np.array([1], 'float32'))
+        x4 = paddle.arange(start, end, step, 'int64')
+        paddle.enable_static()
 
         expected_data = np.arange(0, 5, 1).astype(np.int64)
         for i in [x1, x2, x3, x4]:
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
index 0201f0635a5afeb285cdbca3e8d526a1ff5032f2..3639c4dea0a3a12aa46d2875affeebd4c623a4dd 100644
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -201,107 +201,5 @@ class BaseTestComplex2_2(OpTest):
             }
 
 
-class APT_ArgMaxTest(unittest.TestCase):
-    def test_output_result(self):
-        with fluid.program_guard(fluid.Program()):
-            data1 = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            data2 = fluid.data(name="Y", shape=[3], dtype="int64")
-            out = paddle.argmax(input=data1, out=data2)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result = exe.run(
-                feed={"X": np.random.rand(3, 4).astype("float32")},
-                fetch_list=[data2, out])
-            self.assertEqual((result[0] == result[1]).all(), True)
-
-    def test_basic(self):
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=1)
-
-            result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data, axis=0)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=0)
-
-            result = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data, dtype="int32")
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=1).astype(np.int32)
-
-            result = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data1 = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            data2 = fluid.data(name="Y", shape=[3], dtype="int64")
-            out = paddle.argmax(input=data, out=data2)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result = exe.run(
-                feed={"X": np.random.rand(3, 4).astype("float32")},
-                fetch_list=[data2, out])
-        self.assertEqual((result[0] == result[1]).all(), True)
-
-    def test_name(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[100], dtype="float32")
-            y_1 = paddle.argmax(x, name='arg_max_res')
-            self.assertEqual(('arg_max_res' in y_1.name), True)
-
-    def test_errors(self):
-        def test_dtype1():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.argmax(data, dtype="float32")
-
-        self.assertRaises(TypeError, test_dtype1)
-
-        def test_dtype2():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float64")
-                paddle.argmax(data, dtype="float32")
-
-        self.assertRaises(TypeError, test_dtype2)
-
-
-class TestArgMinMaxOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-
-            def test_argmax_x_type():
-                x1 = [1, 2, 3]
-                output = fluid.layers.argmax(x=x1)
-
-            self.assertRaises(TypeError, test_argmax_x_type)
-
-            def test_argmin_x_type():
-                x2 = [1, 2, 3]
-                output = fluid.layers.argmin(x=x2)
-
-            self.assertRaises(TypeError, test_argmin_x_type)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c1f9d802c31ac2c3b244541936ba25018e1487a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+
+
+def create_kernel_case(op_type, numpy_op_type):
+    class ArgMinMaxKernelBaseCase(OpTest):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 0
+
+        def setUp(self):
+            np.random.seed(123)
+            self.initTestCase()
+            self.dims = (4, 5, 6)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {'Out': self.numpy_op(self.x, axis=self.axis)}
+
+        def test_check_output(self):
+            paddle.enable_static()
+            self.check_output()
+
+    class ArgMinMaxKernelCase0(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 1
+
+    class ArgMinMaxKernelCase1(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 2
+
+    class ArgMinMaxKernelCase2(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = -1
+
+    class ArgMinMaxKernelCase3(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = -2
+
+    class ArgMinMaxKernelCase4(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4, 5, 6)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "keepdims": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out': self.numpy_op(
+                    self.x, axis=self.axis).reshape((1, 5, 6))
+            }
+
+    class ArgMinMaxKernelCase5(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "flatten": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out': self.numpy_op(
+                    self.x.flatten(), axis=self.axis)
+            }
+
+    class ArgMinMaxKernelCase6(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "flatten": True, "keepdims": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out':
+                np.array(self.numpy_op(
+                    self.x.flatten(), axis=self.axis))
+            }
+
+    cls_name = "ArgMinMaxKernelBaseCase_%s" % (op_type)
+    ArgMinMaxKernelBaseCase.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelBaseCase
+
+    cls_name = "ArgMinMaxKernelCase0_%s" % (op_type)
+    ArgMinMaxKernelCase0.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase0
+
+    cls_name = "ArgMinMaxKernelCase1_%s" % (op_type)
+    ArgMinMaxKernelCase1.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase1
+
+    cls_name = "ArgMinMaxKernelCase2_%s" % (op_type)
+    ArgMinMaxKernelCase2.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase2
+
+    cls_name = "ArgMinMaxKernelCase3_%s" % (op_type)
+    ArgMinMaxKernelCase3.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase3
+
+    cls_name = "ArgMinMaxKernelCase4_%s" % (op_type)
+    ArgMinMaxKernelCase4.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase4
+
+    cls_name = "ArgMinMaxKernelCase5_%s" % (op_type)
+    ArgMinMaxKernelCase5.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase5
+
+    cls_name = "ArgMinMaxKernelCase6_%s" % (op_type)
+    ArgMinMaxKernelCase6.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase6
+
+
+for op_type, numpy_op_type in zip(['arg_max', 'arg_min'], ['argmax', 'argmin']):
+    create_kernel_case(op_type, numpy_op_type)
+
+
+def create_test_case(op_type):
+    class ArgMaxMinTestCase(unittest.TestCase):
+        def setUp(self):
+            np.random.seed(123)
+            self.input_data = np.random.rand(10, 10).astype("float32")
+            self.places = []
+            self.places.append(fluid.CPUPlace())
+            if core.is_compiled_with_cuda():
+                self.places.append(paddle.CUDAPlace(0))
+            self.op = eval("paddle.%s" % (op_type))
+            self.numpy_op = eval("np.%s" % (op_type))
+
+        def run_static(self, place):
+            paddle.enable_static()
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data)
+                self.assertTrue((result_data == np.array(expected_data)).all(),
+                                True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=1)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data, axis=1)
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=-1)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data, axis=-1)
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=-1, keepdim=True)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(
+                    self.input_data, axis=-1).reshape((10, 1))
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                op = eval("paddle.%s" % (op_type))
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                result = op(data_var, axis=-1, name="test_arg_api")
+                self.assertTrue("test_arg_api" in result.name)
+
+        def run_dygraph(self, place):
+            paddle.disable_static()
+            op = eval("paddle.%s" % (op_type))
+            data_tensor = paddle.to_tensor(self.input_data)
+
+            #case 1 
+            result_data = op(data_tensor)
+            excepted_data = self.numpy_op(self.input_data)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 2 
+            result_data = op(data_tensor, axis=1)
+            excepted_data = self.numpy_op(self.input_data, axis=1)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 3 
+            result_data = op(data_tensor, axis=-1)
+            excepted_data = self.numpy_op(self.input_data, axis=-1)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 4 
+            result_data = op(data_tensor, axis=-1, keepdim=True)
+            excepted_data = self.numpy_op(self.input_data, axis=-1)
+            excepted_data = excepted_data.reshape((10))
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 5 
+            result_data = op(data_tensor, axis=-1, keepdim=True, dtype="int32")
+            self.assertTrue(result_data.numpy().dtype == np.int32)
+
+            # case for dim 4, 5, 6, for test case coverage
+            input_data = np.random.rand(5, 5, 5, 5)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            input_data = np.random.rand(4, 4, 4, 4, 4)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            input_data = np.random.rand(3, 3, 3, 3, 3, 3)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+        def test_case(self):
+            for place in self.places:
+                self.run_static(place)
+                self.run_dygraph(place)
+
+    cls_name = "ArgMaxMinTestCase_{}".format(op_type)
+    ArgMaxMinTestCase.__name__ = cls_name
+    globals()[cls_name] = ArgMaxMinTestCase
+
+
+for op_type in ['argmin', 'argmax']:
+    create_test_case(op_type)
+
+
+class TestArgMinMaxOpError(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+
+            def test_argmax_x_type():
+                x1 = [1, 2, 3]
+                output = paddle.argmax(x=x1)
+
+            self.assertRaises(TypeError, test_argmax_x_type)
+
+            def test_argmin_x_type():
+                x2 = [1, 2, 3]
+                output = paddle.argmin(x=x2)
+
+            self.assertRaises(TypeError, test_argmin_x_type)
+
+            def test_argmax_attr_type():
+                data = paddle.static.data(
+                    name="test_argmax", shape=[10], dtype="float32")
+                output = paddle.argmax(x=data, dtype="float32")
+
+            self.assertRaises(ValueError, test_argmax_attr_type)
+
+            def test_argmin_attr_type():
+                data = paddle.static.data(
+                    name="test_argmax", shape=[10], dtype="float32")
+                output = paddle.argmin(x=data, dtype="float32")
+
+            self.assertRaises(ValueError, test_argmin_attr_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
index eb19c8fd6b45cab65e9c9bced189478098bdb66c..2a8e0e6c7f0bcf4a779b4c098cd4af816e976205 100644
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import unittest
 import paddle
 import paddle.fluid as fluid
-import paddle.imperative as imperative
 import paddle.fluid.layers as layers
 import numpy as np
 import six
@@ -384,20 +383,21 @@ class TestArgsortDygraph(unittest.TestCase):
             self.place = core.CPUPlace()
 
     def test_api_0(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.argsort(var_x)
-            self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(),
-                             True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.argsort(var_x)
+        self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(),
+                         True)
+        paddle.enable_static()
 
     def test_api_1(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.argsort(var_x, axis=-1)
-            self.assertEqual(
-                (np.argsort(
-                    self.input_data, axis=-1) == out.numpy()).all(),
-                True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.argsort(var_x, axis=-1)
+        self.assertEqual(
+            (np.argsort(
+                self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
index dd373f523a0b20b3ee30b56907828852fbcc0ee6..fd009db5fd00133c5bad7c8c52662002ebd03fa8 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
 from paddle.fluid.framework import program_guard
@@ -60,6 +59,10 @@ class AutoCheckPointACLBase(AutoCheckpointBase):
         os.environ.clear()
         os.environ.update(self._old_environ)
 
+        file_name = os.path.basename(__file__)
+        base_name = os.path.splitext(file_name)[0]
+        print("runnng name:", base_name)
+
     def _run_normal(self):
         exe, main_prog, startup_prog = self._generate()
 
@@ -183,6 +186,20 @@ class AutoCheckPointACLBase(AutoCheckpointBase):
         fs.delete(save_dir)
         logger.info("begin _run_load_0")
 
+    def _test_corner_epoch_no(self, break_epoch_no):
+        logger.info("begin test_corener_epoch_no")
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+        self._run_save_0(break_epoch_no=break_epoch_no)
+        self._reset_generator()
+        self._run_load_0(break_epoch_no=break_epoch_no)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        logger.info("end test_corener_epoch_no")
+
 
 class AutoCheckpointTest(AutoCheckPointACLBase):
     def setUp(self):
@@ -194,13 +211,13 @@ class AutoCheckpointTest(AutoCheckPointACLBase):
             "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
             "PADDLE_TRAINER_ID": "0",
             "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
-            "PADDLE_JOB_ID": "test_job_auto_1",
+            "PADDLE_JOB_ID": "test_job_auto_0",
             "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
             "PADDLE_EDL_HDFS_NAME": "",
             "PADDLE_EDL_HDFS_UGI": "",
-            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_0",
             "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
-            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_0",
             "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
         }
         os.environ.update(proc_env)
@@ -247,102 +264,6 @@ class AutoCheckpointTest(AutoCheckPointACLBase):
 
         logger.info("end test_not_use")
 
-    def test_multiple(self):
-        checker = acp._get_checker()
-        fs = HDFSClient(checker.hdfs_home, None)
-        fs.delete(checker.hdfs_checkpoint_path)
-        self._reset_generator()
-
-        logger.info("begin test_multiple")
-        fs = LocalFS()
-        save_dir = "./run_save_0"
-        fs.delete(save_dir)
-
-        exe, main_prog1, startup_prog1 = self._generate()
-        _, main_prog2, startup_prog2 = self._generate()
-
-        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
-            self._init_env(exe, main_prog1, startup_prog1)
-
-        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
-            self._init_env(exe, main_prog2, startup_prog2)
-
-        o = None
-        epochs = []
-        for i in acp.train_epoch_range(3, 0):
-            for data in data_loader1():
-                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])
-
-            for data in data_loader2():
-                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])
-
-            o = acp._get_train_epoch_range()
-            self.assertEqual(len(o._exe_status), 2)
-            print(o._exe_status)
-            epochs.append(i)
-
-        o = acp._get_train_epoch_range()
-        self.assertTrue(o == None, "now train epoch must not exits now")
-        self.assertEqual(i, 2)
-        self.assertEqual(epochs, [0, 1, 2])
-
-        fs.delete(save_dir)
-        logger.info("end test_multiple")
-
-    def test_distributed_basic(self):
-        checker = acp._get_checker()
-        fs = HDFSClient(checker.hdfs_home, None)
-        fs.delete(checker.hdfs_checkpoint_path)
-        self._reset_generator()
-
-        logger.info("begin test_distributed_basic")
-        fs = LocalFS()
-        save_dir = "./run_save_0"
-        fs.delete(save_dir)
-
-        #basic
-        exe, main_prog, startup_prog = self._generate()
-
-        compiled, data_loader, optimizer, loss, image, label = \
-            self._init_env(exe, main_prog, startup_prog, minimize=False)
-
-        #fleet
-        os.environ["TRAINING_ROLE"] = "TRAINER"
-        os.environ["PADDLE_TRAINER_ID"] = "0"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
-
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-
-        with fluid.program_guard(main_prog, startup_prog):
-            dist_optimizer = fleet.distributed_optimizer(optimizer)
-            dist_optimizer.minimize(loss)
-
-        exe.run(startup_prog)
-
-        o = None
-        i = 0
-        name = None
-        for i in acp.train_epoch_range(3, 0):
-            o = acp._get_train_epoch_range()
-            name = o.name
-            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))
-
-            for data in data_loader():
-                fetch = exe.run(fleet.main_program,
-                                feed=data,
-                                fetch_list=[loss])
-
-            self.assertEqual(len(o._exe_status), 1)
-
-        o = acp._get_train_epoch_range()
-        assert o == None, "now train epoch must not exits now"
-        self.assertEqual(i, 2)
-
-        fs.delete(save_dir)
-
-        logger.info("end test_distributed_basic")
-
     def test_checker(self):
         os.environ.pop("PADDLE_JOB_ID", None)
         try:
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
new file mode 100644
index 0000000000000000000000000000000000000000..55173325f621f7333a7c3ca32a9c55becee72e5a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest1(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_1",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        self._test_corner_epoch_no(0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
index 281c3d7af1edfc7c1b03b0bf2893d9dd6469a182..5d72fa01008af55a83d7b9a19747a8d96fb74b2b 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
 from paddle.fluid.framework import program_guard
@@ -58,19 +57,7 @@ class AutoCheckpointTest2(AutoCheckPointACLBase):
         os.environ.update(proc_env)
 
     def test_corner_epoch_no(self):
-        logger.info("begin test_corener_epoch_no")
-        checker = acp._get_checker()
-        fs = HDFSClient(checker.hdfs_home, None)
-
-        for i in range(3):
-            fs.delete(checker.hdfs_checkpoint_path)
-            self._reset_generator()
-            self._run_save_0(break_epoch_no=i)
-            self._reset_generator()
-            self._run_load_0(break_epoch_no=i)
-
-        fs.delete(checker.hdfs_checkpoint_path)
-        logger.info("end test_corener_epoch_no")
+        self._test_corner_epoch_no(1)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5382f7e328ed1afa2d7516cd0d8db2db659aadd7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest3(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_3",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_3",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_3",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        self._test_corner_epoch_no(2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..90db9595d92ef602c03fa7dd104484a4f6101a87
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTestDist(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_dist_basic",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_basic",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_basic",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_distributed_basic(self):
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_distributed_basic")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        #basic
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog, minimize=False)
+
+        #fleet
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with fluid.program_guard(main_prog, startup_prog):
+            dist_optimizer = fleet.distributed_optimizer(optimizer)
+            dist_optimizer.minimize(loss)
+
+        exe.run(startup_prog)
+
+        o = None
+        i = 0
+        name = None
+        for i in acp.train_epoch_range(3, 0):
+            o = acp._get_train_epoch_range()
+            name = o.name
+            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))
+
+            for data in data_loader():
+                fetch = exe.run(fleet.main_program,
+                                feed=data,
+                                fetch_list=[loss])
+
+            self.assertEqual(len(o._exe_status), 1)
+
+        o = acp._get_train_epoch_range()
+        assert o == None, "now train epoch must not exits now"
+        self.assertEqual(i, 2)
+
+        fs.delete(save_dir)
+
+        logger.info("end test_distributed_basic")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c10cd0e9922859bf3bad2015587fc0a6b2ba5da
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTestMul(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_dist_multiple",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_multiple",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_multiple",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_multiple(self):
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_multiple")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        exe, main_prog1, startup_prog1 = self._generate()
+        _, main_prog2, startup_prog2 = self._generate()
+
+        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
+            self._init_env(exe, main_prog1, startup_prog1)
+
+        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
+            self._init_env(exe, main_prog2, startup_prog2)
+
+        o = None
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            for data in data_loader1():
+                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])
+
+            for data in data_loader2():
+                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])
+
+            o = acp._get_train_epoch_range()
+            self.assertEqual(len(o._exe_status), 2)
+            print(o._exe_status)
+            epochs.append(i)
+
+        o = acp._get_train_epoch_range()
+        self.assertTrue(o == None, "now train epoch must not exits now")
+        self.assertEqual(i, 2)
+        self.assertEqual(epochs, [0, 1, 2])
+
+        fs.delete(save_dir)
+        logger.info("end test_multiple")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index bc666c0de5be06be7529bced39071303430c8ace..875f6211a7fbd98463d98dff91d93cc1b431fc86 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -86,6 +86,31 @@ class TestBaseLayer(unittest.TestCase):
             ret = l()
             self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
 
+    def test_add_parameter_with_error(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            param = net.create_parameter(shape=[1])
+
+            with self.assertRaises(TypeError):
+                net.add_parameter(10, param)
+
+            with self.assertRaises(KeyError):
+                net.add_parameter("param.name", param)
+
+            with self.assertRaises(KeyError):
+                net.add_parameter("", param)
+
+            with self.assertRaises(KeyError):
+                net.test_param = 10
+                net.add_parameter("test_param", param)
+
+            with self.assertRaises(TypeError):
+                net.add_parameter("no_param", 10)
+
+            load_param = net.create_parameter(shape=[1])
+            net._loaddict_holder[load_param.name] = load_param
+            net.add_parameter("load_param", load_param)
+
 
 class BufferLayer(fluid.Layer):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestBatchNorm(unittest.TestCase):
+    def test_name(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            with fluid.dygraph.guard(p):
+                batch_norm1d = paddle.nn.BatchNorm1d(1, name="test")
+
+    def test_error(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            #paddle.disable_static()
+            x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+            x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+
+            def error1d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                batch_norm1d = paddle.nn.BatchNorm1d(1)
+                batch_norm1d(fluid.dygraph.to_variable(x_data_4))
+
+            def error2d():
+                x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+                batch_norm2d = paddle.nn.BatchNorm2d(1)
+                batch_norm2d(fluid.dygraph.to_variable(x_data_3))
+
+            def error3d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                batch_norm3d = paddle.nn.BatchNorm3d(1)
+                batch_norm3d(fluid.dygraph.to_variable(x_data_4))
+
+            with fluid.dygraph.guard(p):
+                self.assertRaises(ValueError, error1d)
+                self.assertRaises(ValueError, error2d)
+                self.assertRaises(ValueError, error3d)
+
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x, is_test, trainable_statistics):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    bn = paddle.nn.BatchNorm2d(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x, False, False)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np, is_test, trainable_statistics):
+                with program_guard(Program(), Program()):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = bn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    bn = paddle.nn.BatchNorm2d(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = bn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x, False, False)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
index 7d90bbd0357bcc93cf7a66e99082feeb7e254db4..6ec6fdb59f200ce1dc9b6418b7f11329f85ba5dd 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
@@ -17,7 +17,7 @@ from __future__ import division
 import unittest
 
 import paddle.fluid as fluid
-from paddle.io import BatchSampler, Dataset
+from paddle.io import BatchSampler, Dataset, Sampler, SequenceSampler, RandomSampler
 
 
 class RandomDataset(Dataset):
@@ -35,6 +35,72 @@ class RandomDataset(Dataset):
         return self.sample_num
 
 
+class TestSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = Sampler(dataset)
+        try:
+            iter(sampler)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+
+class TestSequenceSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = SequenceSampler(dataset)
+        assert len(sampler) == 100
+
+        for i, index in enumerate(iter(sampler)):
+            assert i == index
+
+
+class TestRandomSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = RandomSampler(dataset)
+        assert len(sampler) == 100
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 100))
+
+    def test_with_num_samples(self):
+        dataset = RandomDataset(100, 10)
+        sampler = RandomSampler(dataset, num_samples=50, replacement=True)
+        assert len(sampler) == 50
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+            assert i >= 0 and i < 100
+
+    def test_with_generator(self):
+        dataset = RandomDataset(100, 10)
+        generator = iter(range(0, 60))
+        sampler = RandomSampler(dataset, generator=generator)
+        assert len(sampler) == 100
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 60))
+
+    def test_with_generator_num_samples(self):
+        dataset = RandomDataset(100, 10)
+        generator = iter(range(0, 60))
+        sampler = RandomSampler(
+            dataset, generator=generator, num_samples=50, replacement=True)
+        assert len(sampler) == 50
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 50))
+
+
 class TestBatchSampler(unittest.TestCase):
     def setUp(self):
         self.num_samples = 1000
@@ -86,16 +152,18 @@ class TestBatchSamplerShuffle(TestBatchSampler):
         self.drop_last = True
 
 
-class TestBatchSamplerWithIndices(TestBatchSampler):
+class TestBatchSamplerWithSampler(TestBatchSampler):
     def init_batch_sampler(self):
+        dataset = RandomDataset(1000, 10)
+        sampler = SequenceSampler(dataset)
         bs = BatchSampler(
-            indices=list(range(self.num_samples)),
+            sampler=sampler,
             batch_size=self.batch_size,
             drop_last=self.drop_last)
         return bs
 
 
-class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
+class TestBatchSamplerWithSamplerDropLast(unittest.TestCase):
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -103,12 +171,22 @@ class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
         self.shuffle = False
         self.drop_last = True
 
+
+class TestBatchSamplerWithSamplerShuffle(unittest.TestCase):
+    def setUp(self):
+        self.num_samples = 1000
+        self.num_classes = 10
+        self.batch_size = 32
+        self.shuffle = True
+        self.drop_last = True
+
     def test_main(self):
         try:
             dataset = RandomDataset(self.num_samples, self.num_classes)
+            sampler = RandomSampler(dataset)
             bs = BatchSampler(
-                dataset=dataset,
-                indices=list(range(self.num_samples)),
+                sampler=sampler,
+                shuffle=self.shuffle,
                 batch_size=self.batch_size,
                 drop_last=self.drop_last)
             self.assertTrue(False)
diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py
index 21571e0981065a0a1e2a5db03e91b4df0ea55d9a..a8054295b41c1f6d0008c4f0a9fadb6f04c647fc 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -19,93 +19,189 @@ import unittest
 from op_test import OpTest
 
 
+def test_static_layer(place,
+                      input_np,
+                      label_np,
+                      reduction='mean',
+                      weight_np=None):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        input = paddle.data(name='input', shape=input_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            bce_loss = paddle.nn.loss.BCELoss(
+                weight=weight, reduction=reduction)
+        else:
+            bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
+        res = bce_loss(input, label)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog,
+                                feed={"input": input_np,
+                                      "label": label_np}
+                                if weight_np is None else {
+                                    "input": input_np,
+                                    "label": label_np,
+                                    "weight": weight_np
+                                },
+                                fetch_list=[res])
+    return static_result
+
+
+def test_static_functional(place,
+                           input_np,
+                           label_np,
+                           reduction='mean',
+                           weight_np=None):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        input = paddle.data(name='input', shape=input_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            res = paddle.nn.functional.binary_cross_entropy(
+                input, label, weight=weight, reduction=reduction)
+        else:
+            res = paddle.nn.functional.binary_cross_entropy(
+                input, label, reduction=reduction)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog,
+                                feed={"input": input_np,
+                                      "label": label_np}
+                                if weight_np is None else {
+                                    "input": input_np,
+                                    "label": label_np,
+                                    "weight": weight_np
+                                },
+                                fetch_list=[res])
+    return static_result
+
+
+def test_dygraph_layer(place,
+                       input_np,
+                       label_np,
+                       reduction='mean',
+                       weight_np=None):
+    paddle.disable_static()
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+        bce_loss = paddle.nn.loss.BCELoss(weight=weight, reduction=reduction)
+    else:
+        bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
+    dy_res = bce_loss(paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def test_dygraph_functional(place,
+                            input_np,
+                            label_np,
+                            reduction='mean',
+                            weight_np=None):
+    paddle.disable_static()
+    input = paddle.to_tensor(input_np)
+    label = paddle.to_tensor(label_np)
+
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+        dy_res = paddle.nn.functional.binary_cross_entropy(
+            input, label, weight=weight, reduction=reduction)
+    else:
+        dy_res = paddle.nn.functional.binary_cross_entropy(
+            input, label, reduction=reduction)
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def calc_bceloss(input_np, label_np, reduction='mean', weight_np=None):
+    if weight_np is None:
+        expected = -1 * (label_np * np.log(input_np) +
+                         (1. - label_np) * np.log(1. - input_np))
+    else:
+        expected = -1 * weight_np * (label_np * np.log(input_np) +
+                                     (1. - label_np) * np.log(1. - input_np))
+
+    if reduction == 'mean':
+        expected = np.mean(expected)
+    elif reduction == 'sum':
+        expected = np.sum(expected)
+    else:
+        expected = expected
+
+    return expected
+
+
 class TestBCELoss(unittest.TestCase):
     def test_BCELoss(self):
-        input_np = np.random.random(size=(20, 30)).astype(np.float64)
-        label_np = np.random.random(size=(20, 30)).astype(np.float64)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
+        input_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
+        label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
         places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         reductions = ['sum', 'mean', 'none']
         for place in places:
-            for red in reductions:
-                with fluid.program_guard(prog, startup_prog):
-                    input = fluid.data(
-                        name='input', shape=[None, 30], dtype='float64')
-                    label = fluid.data(
-                        name='label', shape=[None, 30], dtype='float64')
-                    bce_loss = paddle.nn.loss.BCELoss(reduction=red)
-                    res = bce_loss(input, label)
-
-                    exe = fluid.Executor(place)
-                    static_result = exe.run(
-                        prog,
-                        feed={"input": input_np,
-                              "label": label_np},
-                        fetch_list=[res])
-
-                with fluid.dygraph.guard():
-                    bce_loss = paddle.nn.loss.BCELoss(reduction=red)
-                    dy_res = bce_loss(
-                        fluid.dygraph.to_variable(input_np),
-                        fluid.dygraph.to_variable(label_np))
-                    dy_result = dy_res.numpy()
-
-                expected = -1 * (label_np * np.log(input_np) +
-                                 (1. - label_np) * np.log(1. - input_np))
-                if red == 'mean':
-                    expected = np.mean(expected)
-                elif red == 'sum':
-                    expected = np.sum(expected)
-                else:
-                    expected = expected
+            for reduction in reductions:
+                static_result = test_static_layer(place, input_np, label_np,
+                                                  reduction)
+                dy_result = test_dygraph_layer(place, input_np, label_np,
+                                               reduction)
+                expected = calc_bceloss(input_np, label_np, reduction)
                 self.assertTrue(np.allclose(static_result, expected))
                 self.assertTrue(np.allclose(static_result, dy_result))
                 self.assertTrue(np.allclose(dy_result, expected))
+                static_functional = test_static_functional(place, input_np,
+                                                           label_np, reduction)
+                dy_functional = test_dygraph_functional(place, input_np,
+                                                        label_np, reduction)
+                self.assertTrue(np.allclose(static_functional, expected))
+                self.assertTrue(np.allclose(static_functional, dy_functional))
+                self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_weight(self):
-        input_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        input_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
         weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[None, 3, 4, 10], dtype='float64')
-            label = fluid.data(
-                name='label', shape=[None, 3, 4, 10], dtype='float64')
-            weight = fluid.data(
-                name='weight', shape=[3, 4, 10], dtype='float64')
-            bce_loss = paddle.nn.loss.BCELoss(weight=weight)
-            res = bce_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(prog,
-                                    feed={
-                                        "input": input_np,
-                                        "label": label_np,
-                                        "weight": weight_np
-                                    },
-                                    fetch_list=[res])
-
-        with fluid.dygraph.guard():
-            bce_loss = paddle.nn.loss.BCELoss(
-                weight=fluid.dygraph.to_variable(weight_np))
-            dy_res = bce_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_res.numpy()
-
-        expected = np.mean(-1 * weight_np *
-                           (label_np * np.log(input_np) +
-                            (1. - label_np) * np.log(1. - input_np)))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+        for reduction in ['sum', 'mean', 'none']:
+            static_result = test_static_layer(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            dy_result = test_dygraph_layer(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            expected = calc_bceloss(
+                input_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            static_functional = test_static_functional(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            dy_functional = test_dygraph_functional(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_functional, expected))
+            self.assertTrue(np.allclose(static_functional, dy_functional))
+            self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCELoss_error(self):
+        paddle.disable_static()
+        self.assertRaises(
+            ValueError, paddle.nn.loss.BCELoss, reduction="unsupport reduction")
+        input = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
+        label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
+        self.assertRaises(
+            ValueError,
+            paddle.nn.functional.binary_cross_entropy,
+            input=input,
+            label=label,
+            reduction="unsupport reduction")
+        paddle.enable_static()
 
 
 def bce_loss(input, label):
diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ba13a6da01c7dbf8b0e854df43f11b19a4ebd4c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+from op_test import OpTest
+
+
+def call_bce_layer(logit, label, weight=None, reduction='mean',
+                   pos_weight=None):
+    bce_logit_loss = paddle.nn.loss.BCEWithLogitsLoss(
+        weight=weight, reduction=reduction, pos_weight=pos_weight)
+    res = bce_logit_loss(logit, label)
+    return res
+
+
+def call_bce_functional(logit,
+                        label,
+                        weight=None,
+                        reduction='mean',
+                        pos_weight=None):
+    res = paddle.nn.functional.binary_cross_entropy_with_logits(
+        logit, label, weight=weight, reduction=reduction, pos_weight=pos_weight)
+    return res
+
+
+def test_static(place,
+                logit_np,
+                label_np,
+                weight_np=None,
+                reduction='mean',
+                pos_weight_np=None,
+                functional=False):
+    paddle.enable_static()
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        feed_dict = {"logit": logit_np, "label": label_np}
+
+        pos_weight = None
+        weight = None
+        if pos_weight_np is not None:
+            pos_weight = paddle.data(
+                name='pos_weight', shape=pos_weight_np.shape, dtype='float64')
+            feed_dict["pos_weight"] = pos_weight_np
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            feed_dict["weight"] = weight_np
+        if functional:
+            res = call_bce_functional(logit, label, weight, reduction,
+                                      pos_weight)
+        else:
+            res = call_bce_layer(logit, label, weight, reduction, pos_weight)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog, feed=feed_dict, fetch_list=[res])
+    return static_result
+
+
+def test_dygraph(place,
+                 logit_np,
+                 label_np,
+                 weight_np=None,
+                 reduction='mean',
+                 pos_weight_np=None,
+                 functional=False):
+    paddle.disable_static()
+    logit = paddle.to_tensor(logit_np)
+    label = paddle.to_tensor(label_np)
+    weight = None
+    pos_weight = None
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+    if pos_weight_np is not None:
+        pos_weight = paddle.to_tensor(pos_weight_np)
+    if functional:
+        dy_res = call_bce_functional(logit, label, weight, reduction,
+                                     pos_weight)
+    else:
+        dy_res = call_bce_layer(logit, label, weight, reduction, pos_weight)
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def calc_bce_with_logits_loss(logit_np,
+                              label_np,
+                              reduction='mean',
+                              weight_np=None,
+                              pos_weight=None):
+    expected = np.maximum(
+        logit_np,
+        0) - logit_np * label_np + np.log(1 + np.exp(-np.abs(logit_np)))
+    if pos_weight is not None:
+        expected = expected * ((pos_weight - 1) * label_np + 1)
+    if weight_np is not None:
+        expected = weight_np * expected
+
+    if reduction == 'mean':
+        expected = np.mean(expected)
+    elif reduction == 'sum':
+        expected = np.sum(expected)
+    else:
+        expected = expected
+
+    return expected
+
+
+class TestBCEWithLogitsLoss(unittest.TestCase):
+    def test_BCEWithLogitsLoss(self):
+        logit_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
+        label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        reductions = ['sum', 'mean', 'none']
+        for place in places:
+            for reduction in reductions:
+                static_result = test_static(
+                    place, logit_np, label_np, reduction=reduction)
+                dy_result = test_dygraph(
+                    place, logit_np, label_np, reduction=reduction)
+                expected = calc_bce_with_logits_loss(logit_np, label_np,
+                                                     reduction)
+                self.assertTrue(np.allclose(static_result, expected))
+                self.assertTrue(np.allclose(static_result, dy_result))
+                self.assertTrue(np.allclose(dy_result, expected))
+                static_functional = test_static(
+                    place,
+                    logit_np,
+                    label_np,
+                    reduction=reduction,
+                    functional=True)
+                dy_functional = test_dygraph(
+                    place,
+                    logit_np,
+                    label_np,
+                    reduction=reduction,
+                    functional=True)
+                self.assertTrue(np.allclose(static_functional, expected))
+                self.assertTrue(np.allclose(static_functional, dy_functional))
+                self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_weight(self):
+        logit_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        for reduction in ['sum', 'mean', 'none']:
+            static_result = test_static(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction)
+            dy_result = test_dygraph(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction)
+            expected = calc_bce_with_logits_loss(
+                logit_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            static_functional = test_static(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction,
+                functional=True)
+            dy_functional = test_dygraph(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction,
+                functional=True)
+            self.assertTrue(np.allclose(static_functional, expected))
+            self.assertTrue(np.allclose(static_functional, dy_functional))
+            self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_pos_weight(self):
+        logit_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        pos_weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
+        weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        reduction = "mean"
+        static_result = test_static(place, logit_np, label_np, weight_np,
+                                    reduction, pos_weight_np)
+        dy_result = test_dygraph(place, logit_np, label_np, weight_np,
+                                 reduction, pos_weight_np)
+        expected = calc_bce_with_logits_loss(logit_np, label_np, reduction,
+                                             weight_np, pos_weight_np)
+        self.assertTrue(np.allclose(static_result, expected))
+        self.assertTrue(np.allclose(static_result, dy_result))
+        self.assertTrue(np.allclose(dy_result, expected))
+        static_functional = test_static(
+            place,
+            logit_np,
+            label_np,
+            weight_np,
+            reduction,
+            pos_weight_np,
+            functional=True)
+        dy_functional = test_dygraph(
+            place,
+            logit_np,
+            label_np,
+            weight_np,
+            reduction,
+            pos_weight_np,
+            functional=True)
+        self.assertTrue(np.allclose(static_functional, expected))
+        self.assertTrue(np.allclose(static_functional, dy_functional))
+        self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_error(self):
+        paddle.disable_static()
+        self.assertRaises(
+            ValueError,
+            paddle.nn.BCEWithLogitsLoss,
+            reduction="unsupport reduction")
+        logit = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
+        label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
+        self.assertRaises(
+            ValueError,
+            paddle.nn.functional.binary_cross_entropy_with_logits,
+            logit=logit,
+            label=label,
+            reduction="unsupport reduction")
+        paddle.enable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..12a29de80426639ab3a9d2b879bb88a461ba2ab4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+from op_test import OpTest
+import numpy as np
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, bins=2)
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.5 * np.ones((2))
+    return hist, prob
+
+
+class TestBernoulliOp(OpTest):
+    def setUp(self):
+        self.op_type = "bernoulli"
+        self.inputs = {"X": np.random.uniform(size=(1000, 784))}
+        self.init_attrs()
+        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
+
+    def init_attrs(self):
+        self.attrs = {}
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        hist, prob = self.output_hist(np.array(outs[0]))
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+class TestBernoulliApi(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        x = paddle.rand([1024, 1024])
+        out = paddle.bernoulli(x)
+        paddle.enable_static()
+        hist, prob = output_hist(out.numpy())
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+    def test_static(self):
+        x = paddle.rand([1024, 1024])
+        out = paddle.bernoulli(x)
+        exe = paddle.static.Executor(paddle.CPUPlace())
+        out = exe.run(paddle.static.default_main_program(),
+                      fetch_list=[out.name])
+        hist, prob = output_hist(out[0])
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..01daea32167d28edbb46d6854872976aed79494e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
@@ -0,0 +1,504 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle
+from paddle.fluid import Program, program_guard
+from paddle.nn.functional import interpolate
+
+
+def cubic_1(x, a):
+    return ((a + 2) * x - (a + 3)) * x * x + 1
+
+
+def cubic_2(x, a):
+    return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a
+
+
+def cubic_interp1d(x0, x1, x2, x3, t):
+    param = [0, 0, 0, 0]
+    a = -0.75
+    x_1 = t
+    x_2 = 1.0 - t
+    param[0] = cubic_2(x_1 + 1.0, a)
+    param[1] = cubic_1(x_1, a)
+    param[2] = cubic_1(x_2, a)
+    param[3] = cubic_2(x_2 + 1.0, a)
+    return x0 * param[0] + x1 * param[1] + x2 * param[2] + x3 * param[3]
+
+
+def value_bound(input, w, h, x, y):
+    access_x = int(max(min(x, w - 1), 0))
+    access_y = int(max(min(y, h - 1), 0))
+    return input[:, :, access_y, access_x]
+
+
+def bicubic_interp_np(input,
+                      out_h,
+                      out_w,
+                      out_size=None,
+                      actual_shape=None,
+                      align_corners=True,
+                      data_layout='kNCHW'):
+    """trilinear interpolation implement in shape [N, C, H, W]"""
+    if data_layout == "NHWC":
+        input = np.transpose(input, (0, 3, 1, 2))  # NHWC => NCHW
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    batch_size, channel, in_h, in_w = input.shape
+
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+
+    for k in range(out_h):
+        if (align_corners):
+            h = ratio_h * k
+        else:
+            h = ratio_h * (k + 0.5) - 0.5
+        input_y = np.floor(h)
+        y_t = h - input_y
+        for l in range(out_w):
+            if (align_corners):
+                w = ratio_w * l
+            else:
+                w = ratio_w * (l + 0.5) - 0.5
+            input_x = np.floor(w)
+            x_t = w - input_x
+            for i in range(batch_size):
+                for j in range(channel):
+                    coefficients = [0, 0, 0, 0]
+                    for ii in range(4):
+                        access_x_0 = int(max(min(input_x - 1, in_w - 1), 0))
+                        access_x_1 = int(max(min(input_x + 0, in_w - 1), 0))
+                        access_x_2 = int(max(min(input_x + 1, in_w - 1), 0))
+                        access_x_3 = int(max(min(input_x + 2, in_w - 1), 0))
+                        access_y = int(max(min(input_y - 1 + ii, in_h - 1), 0))
+
+                        coefficients[ii] = cubic_interp1d(
+                            input[i, j, access_y, access_x_0],
+                            input[i, j, access_y, access_x_1],
+                            input[i, j, access_y, access_x_2],
+                            input[i, j, access_y, access_x_3], x_t)
+                    out[i, j, k, l] = cubic_interp1d(
+                        coefficients[0], coefficients[1], coefficients[2],
+                        coefficients[3], y_t)
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+    return out.astype(input.dtype)
+
+
+class TestBicubicInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "bicubic_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size,
+                                      self.actual_shape, self.align_corners,
+                                      self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+
+
+class TestBicubicInterpCase1(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpCase2(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 8
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpCase3(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = False
+
+
+class TestBicubicInterpCase4(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+
+
+class TestBicubicInterpCase5(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 11
+        self.out_w = 11
+        self.scale = 0.
+        self.out_size = np.array([6, 4]).astype("int32")
+        self.align_corners = False
+
+
+class TestBicubicInterpCase6(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0
+        self.out_size = np.array([64, 32]).astype("int32")
+        self.align_corners = False
+
+
+class TestBicubicInterpSame(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpScale(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = [1., 1.]
+        self.align_corners = True
+
+
+class TestBicubicInterpDataLayout(TestBicubicInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.data_layout = "NHWC"
+
+
+class TestBicubicInterpOpAPI(unittest.TestCase):
+    def test_case(self):
+        np.random.seed(200)
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        dim_data = np.array([12]).astype("int32")
+        shape_data = np.array([12, 12]).astype("int32")
+        actual_size_data = np.array([12, 12]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        with fluid.program_guard(prog, startup_prog):
+
+            x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+            dim = fluid.data(name="dim", shape=[1], dtype="int32")
+            shape_tensor = fluid.data(
+                name="shape_tensor", shape=[2], dtype="int32")
+            actual_size = fluid.data(
+                name="actual_size", shape=[2], dtype="int32")
+            scale_tensor = fluid.data(
+                name="scale_tensor", shape=[1], dtype="float32")
+
+            out1 = interpolate(
+                x, size=[12, 12], mode='bicubic', align_corners=False)
+            out2 = interpolate(
+                x, size=[12, dim], mode='bicubic', align_corners=False)
+            out3 = interpolate(
+                x, size=shape_tensor, mode='bicubic', align_corners=False)
+            out4 = interpolate(
+                x, size=[12, 12], mode='bicubic', align_corners=False)
+            out5 = interpolate(
+                x,
+                scale_factor=scale_tensor,
+                mode='bicubic',
+                align_corners=False)
+            out6 = interpolate(
+                x, scale_factor=2.0, mode='bicubic', align_corners=False)
+            out7 = interpolate(
+                x, scale_factor=[2.0, 2.0], mode='bicubic', align_corners=False)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            results = exe.run(
+                fluid.default_main_program(),
+                feed={
+                    "x": x_data,
+                    "dim": dim_data,
+                    "shape_tensor": shape_data,
+                    "actual_size": actual_size_data,
+                    "scale_tensor": scale_data
+                },
+                fetch_list=[out1, out2, out3, out4, out5, out6, out7],
+                return_numpy=True)
+
+            expect_res = bicubic_interp_np(
+                x_data, out_h=12, out_w=12, align_corners=False)
+            for res in results:
+                self.assertTrue(np.allclose(res, expect_res))
+
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = interpolate(
+                x, size=[12, 12], mode='bicubic', align_corners=False)
+            dy_result = interp.numpy()
+            expect = bicubic_interp_np(
+                x_data, out_h=12, out_w=12, align_corners=False)
+            self.assertTrue(np.allclose(dy_result, expect))
+
+
+class TestBicubicOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # the input of interpoalte must be Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, interpolate, x1)
+
+            def test_mode_type():
+                # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC"
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+                out = interpolate(
+                    x, size=[12, 12], mode='UNKONWN', align_corners=False)
+
+            def test_input_shape():
+                x = fluid.data(name="x", shape=[2], dtype="float32")
+                out = interpolate(
+                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+
+            def test_align_corcers():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                interpolate(x, size=[12, 12], mode='BICUBIC', align_corners=3)
+
+            def test_out_shape():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x, size=[12], mode='bicubic', align_corners=False)
+
+            def test_attr_data_format():
+                # for 5-D input, data_format only can be NCDHW or NDHWC
+                input = fluid.data(
+                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+                out = interpolate(
+                    input,
+                    size=[4, 8, 4, 5],
+                    mode='trilinear',
+                    data_format='NHWC')
+
+            def test_actual_shape():
+                # the actual_shape  must be Variable.
+                x = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(
+                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+
+            def test_scale_value():
+                # the scale must be greater than zero.
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='BICUBIC',
+                    align_corners=False,
+                    scale_factor=-2.0)
+
+            def test_attr_5D_input():
+                # for 5-D input, data_format only can be NCDHW or NDHWC
+                input = fluid.data(
+                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+                out = interpolate(
+                    input,
+                    size=[4, 8, 4, 5],
+                    mode='trilinear',
+                    data_format='NDHWC')
+
+            def test_scale_type():
+                # the scale must be greater than zero.
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                scale = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='bicubic',
+                    align_corners=False,
+                    scale_factor=scale)
+
+            def test_align_mode():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=False,
+                    align_mode=2,
+                    scale_factor=1.0)
+
+            def test_outshape_and_scale():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='bicubic',
+                    align_corners=False,
+                    scale_factor=None)
+
+            def test_align_corners_and_nearest():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=True,
+                    scale_factor=None)
+
+            def test_scale_shape():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=False,
+                    scale_factor=[1, 2, 2])
+
+            def test_scale_value():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='trilinear',
+                    align_corners=False,
+                    scale_factor=[1, 2, 2])
+
+            self.assertRaises(ValueError, test_mode_type)
+            self.assertRaises(ValueError, test_input_shape)
+            self.assertRaises(TypeError, test_align_corcers)
+            self.assertRaises(ValueError, test_attr_data_format)
+            self.assertRaises(TypeError, test_actual_shape)
+            self.assertRaises(ValueError, test_scale_value)
+            self.assertRaises(ValueError, test_out_shape)
+            self.assertRaises(ValueError, test_attr_5D_input)
+            self.assertRaises(TypeError, test_scale_type)
+            self.assertRaises(ValueError, test_align_mode)
+            self.assertRaises(ValueError, test_outshape_and_scale)
+            self.assertRaises(ValueError, test_align_corners_and_nearest)
+            self.assertRaises(ValueError, test_scale_shape)
+            self.assertRaises(ValueError, test_scale_value)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_api.py b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eae4797de85f371ed62e78c85b160f698ee9eb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+
+
+class TestBilinearAPI(unittest.TestCase):
+    def test_api(self):
+        with fluid.program_guard(fluid.default_startup_program(),
+                                 fluid.default_main_program()):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+            else:
+                place = core.CPUPlace()
+            exe = fluid.Executor(place)
+
+            data1 = fluid.data(name='X1', shape=[5, 5], dtype='float32')
+            data2 = fluid.data(name='X2', shape=[5, 4], dtype='float32')
+
+            layer1 = np.random.random((5, 5)).astype('float32')
+            layer2 = np.random.random((5, 4)).astype('float32')
+
+            bilinear = paddle.nn.Bilinear(
+                in1_features=5, in2_features=4, out_features=1000)
+            ret = bilinear(data1, data2)
+
+            exe.run(fluid.default_startup_program())
+            ret_fetch = exe.run(feed={'X1': layer1,
+                                      'X2': layer2},
+                                fetch_list=[ret.name])
+            self.assertEqual(ret_fetch[0].shape, (5, 1000))
+
+
+class TestBilinearAPIDygraph(unittest.TestCase):
+    def test_api(self):
+        paddle.disable_static()
+        layer1 = np.random.random((5, 5)).astype('float32')
+        layer2 = np.random.random((5, 4)).astype('float32')
+        bilinear = paddle.nn.Bilinear(
+            in1_features=5, in2_features=4, out_features=1000)
+        ret = bilinear(paddle.to_tensor(layer1), paddle.to_tensor(layer2))
+        self.assertEqual(ret.shape, [5, 1000])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..d139a53c7e2ccc68964457f3142b4ed890d339f2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
@@ -0,0 +1,620 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+import paddle
+
+
+def bilinear_interp_np(input,
+                       out_h,
+                       out_w,
+                       out_size=None,
+                       actual_shape=None,
+                       align_corners=True,
+                       align_mode=0,
+                       data_layout='NCHW'):
+    """bilinear interpolation implement in shape [N, C, H, W]"""
+    if data_layout == "NHWC":
+        input = np.transpose(input, (0, 3, 1, 2))  # NHWC => NCHW
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    batch_size, channel, in_h, in_w = input.shape
+
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+
+    for i in range(out_h):
+        if (align_mode == 0 and not align_corners):
+            h = int(ratio_h * (i + 0.5) - 0.5)
+        else:
+            h = int(ratio_h * i)
+
+        h = max(0, h)
+        hid = 1 if h < in_h - 1 else 0
+        if (align_mode == 0 and not align_corners):
+            idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0)
+            h1lambda = idx_src_h - h
+        else:
+            h1lambda = ratio_h * i - h
+        h2lambda = 1.0 - h1lambda
+        for j in range(out_w):
+            if (align_mode == 0 and not align_corners):
+                w = int(ratio_w * (j + 0.5) - 0.5)
+            else:
+                w = int(ratio_w * j)
+            w = max(0, w)
+            wid = 1 if w < in_w - 1 else 0
+            if (align_mode == 0 and not align_corners):
+                idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0)
+                w1lambda = idx_src_w - w
+            else:
+                w1lambda = ratio_w * j - w
+            w2lambda = 1.0 - w1lambda
+
+            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
+                                        w1lambda*input[:, :, h, w+wid]) + \
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+
+    return out.astype(input.dtype)
+
+
+class TestBilinearInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase1(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase2(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase3(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 33]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpSame(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpActualShape(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+        self.data_layout = "NHWC"
+
+
+class TestBilinearInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestBilinearInterpScale1(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale2(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale3(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale4(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = [1.5, 0.5]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpZero(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpOp_attr_tensor(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float64")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:
+            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
+        elif self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [3, 3]
+        self.align_corners = True
+
+
+# out_size is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+class TestBilinearInterpOpAPI(unittest.TestCase):
+    def test_case(self):
+        x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
+        out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
+        out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
+        out4 = fluid.layers.resize_bilinear(
+            x, out_shape=[4, 4], actual_shape=actual_size)
+        out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
+
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        dim_data = np.array([12]).astype("int32")
+        shape_data = np.array([12, 12]).astype("int32")
+        actual_size_data = np.array([12, 12]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x": x_data,
+                              "dim": dim_data,
+                              "shape_tensor": shape_data,
+                              "actual_size": actual_size_data,
+                              "scale_tensor": scale_data
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5],
+                          return_numpy=True)
+
+        expect_res = bilinear_interp_np(
+            x_data, out_h=12, out_w=12, align_corners=True)
+        for res in results:
+            self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestUpsampleBilinear2dInterpOpAPI2_0(unittest.TestCase):
+    def test_case(self):
+
+        # dygraph
+        x_data = np.random.random((1, 3, 6, 6)).astype("float32")
+        upsample = paddle.nn.UpsamplingBilinear2d(scale_factor=[2, 2])
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = upsample(x)
+            expect = bilinear_interp_np(
+                x_data, out_h=12, out_w=12, align_corners=True)
+            self.assertTrue(np.allclose(interp.numpy(), expect))
+
+
+class TestBilinearInterpOpAPI_dy(unittest.TestCase):
+    def test_case(self):
+        import paddle
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            input_x = paddle.to_tensor(input_data)
+            expect_res = bilinear_interp_np(
+                input_data, out_h=12, out_w=12, align_corners=False)
+            out = interpolate(
+                x=input_x, size=[12, 12], mode="bilinear", align_corners=False)
+            self.assertTrue(np.allclose(out.numpy(), expect_res))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index 5cc8e2ba15d260b988ee66a5711aed42ca04c10b..cc2b1165ec304a63671b48d4702142ea38c9a2c1 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -65,7 +65,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
     """Bipartite Matching algorithm for batch input.
     Arg:
         distance (numpy.array) : The distance of two entries with shape [M, N].
-        lod (list of int): The offsets of each input in this batch.
+        lod (list of int): The length of each input in this batch.
     """
     n = len(lod)
     m = distance.shape[1]
@@ -73,6 +73,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
     match_dist = np.zeros((n, m), dtype=np.float32)
     cur_offset = 0
     for i in range(n):
+        if lod[i] == 0: continue
         bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
                         match_indices[i, :], match_dist[i, :])
         if match_type == 'per_prediction':
@@ -155,5 +156,22 @@ class TestBipartiteMatchOpWithPerPredictionType(OpTest):
         self.check_output()
 
 
+class TestBipartiteMatchOpWithEmptyLoD(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[5, 6, 0, 12]]
+        dist = np.random.random((23, 217)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
+
+        self.inputs = {'DistMat': (dist, lod)}
+        self.outputs = {
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_dist,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a9d450e223f1e0bc84513f7069e0a104fa644e7c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+# use default values
+# FIXME: random fails on Unknown command lines -c (or -m).
+launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
index 4e2280c0118a11ebfc21f6179b8a7a795c6f53da..ab08a0aacbf08768ffff43974ee9a7c7dd4a7288 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
@@ -90,5 +90,55 @@ class TestCholeskyOp2D(TestCholeskyOp):
         self._input_shape = (64, 64)
 
 
+class TestDygraph(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        a = np.random.rand(3, 3)
+        a_t = np.transpose(a, [1, 0])
+        x_data = np.matmul(a, a_t) + 1e-03
+        x = paddle.to_variable(x_data)
+        out = paddle.cholesky(x, upper=False)
+
+
+class TestCholeskySingularAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place, with_out=False):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[4, 4], dtype="float64")
+            result = paddle.cholesky(input)
+
+            input_np = np.zeros([4, 4]).astype("float64")
+
+            exe = fluid.Executor(place)
+            try:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": input_np},
+                                  fetch_list=[result])
+            except fluid.core.EnforceNotMet as ex:
+                print("The mat is singular")
+                pass
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                     [[10, 11, 12], [13, 14, 15],
+                                      [16, 17, 18]]]).astype("float64")
+                input = fluid.dygraph.to_variable(input_np)
+                try:
+                    result = paddle.cholesky(input)
+                except fluid.core.EnforceNotMet as ex:
+                    print("The mat is singular")
+                    pass
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_op.py b/python/paddle/fluid/tests/unittests/test_chunk_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..043b326fbd98769f96688ef2eeaf23c53978c94d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_chunk_op.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import numpy as np
+from paddle.fluid import Program, program_guard
+from paddle import fluid
+import paddle
+
+
+class TestChunkOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # The type of axis in chunk_op should be int or Variable.
+            def test_axis_type():
+                x1 = paddle.data(shape=[4], dtype='float16', name='x3')
+                paddle.chunk(x=x1, chunks=2, axis=3.2)
+
+            self.assertRaises(TypeError, test_axis_type)
+
+            # The type of axis in chunk op should be int or Variable.
+            def test_axis_variable_type():
+                x2 = paddle.data(shape=[4], dtype='float16', name='x9')
+                x3 = paddle.data(shape=[1], dtype='float16', name='x10')
+                paddle.chunk(input=x2, chunks=2, axis=x3)
+
+            self.assertRaises(TypeError, test_axis_variable_type)
+
+            # The type of num_or_sections in chunk_op should be int, tuple or list.
+            def test_chunks_type():
+                x4 = paddle.data(shape=[4], dtype='float16', name='x4')
+                paddle.chunk(input=x4, chunks=2.1, axis=3)
+
+            self.assertRaises(TypeError, test_chunks_type)
+
+            def test_axis_type_tensor():
+                x5 = paddle.data(shape=[4], dtype='float16', name='x6')
+                paddle.chunk(input=x5, chunks=2, axis=3.2)
+
+            self.assertRaises(TypeError, test_axis_type_tensor)
+
+
+class API_TestChunk(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64')
+            data2 = paddle.data('data2', shape=[1], dtype='int32')
+            x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=data2)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            input1 = np.random.random([4, 6, 6]).astype('float64')
+            input2 = np.array([2]).astype('int32')
+            r0, r1, r2, = exe.run(feed={"data1": input1,
+                                        "data2": input2},
+                                  fetch_list=[x0, x1, x2])
+            ex_x0, ex_x1, ex_x2 = np.array_split(input1, 3, axis=2)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+            self.assertTrue(np.allclose(ex_x2, r2))
+
+
+class API_TestChunk1(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64')
+            x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=2)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            input1 = np.random.random([4, 6, 6]).astype('float64')
+            r0, r1, r2, = exe.run(feed={"data1": input1},
+                                  fetch_list=[x0, x1, x2])
+            ex_x0, ex_x1, ex_x2 = np.array_split(input1, 3, axis=2)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+            self.assertTrue(np.allclose(ex_x2, r2))
+
+
+class API_TestDygraphChunk(unittest.TestCase):
+    def test_out1(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+    def test_out2(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("bool")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+    def test_axis_tensor_input(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            num1 = paddle.full(shape=[1], fill_value=1, dtype='int32')
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=num1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clamp.py b/python/paddle/fluid/tests/unittests/test_clamp.py
deleted file mode 100644
index d8d7fe01f8de8686724ea8ebc00491269f2cc0bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_clamp.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.tensor as tensor
-import paddle.fluid as fluid
-import numpy as np
-import unittest
-
-
-class TestClampAPI(unittest.TestCase):
-    def test_dygraph_clamp(self):
-        in1 = np.array([[1.2, 3.5], [4.5, 6.4]]).astype('float32')
-        with fluid.dygraph.guard():
-            x1 = fluid.dygraph.to_variable(in1)
-            out1 = tensor.clamp(x1, min=3.5, max=5.0)
-            out2 = tensor.clamp(x1, min=2.5)
-            self.assertTrue(
-                np.allclose(
-                    out1.numpy(), in1.clip(
-                        min=3.5, max=5.0)))
-            self.assertTrue(np.allclose(out2.numpy(), in1.clip(min=2.5)))
-
-    def test_clamp(self):
-        data_shape = [1, 9, 9, 4]
-        data = np.random.random(data_shape).astype('float32')
-        images = fluid.data(name='image', shape=data_shape, dtype='float32')
-        min = fluid.data(name='min', shape=[1], dtype='float32')
-        max = fluid.data(name='max', shape=[1], dtype='float32')
-
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        out_1 = tensor.clamp(images, min=min, max=max)
-        out_2 = tensor.clamp(images, min=0.2, max=0.9)
-        out_3 = tensor.clamp(images, min=0.3)
-        out_4 = tensor.clamp(images, max=0.7)
-        out_5 = tensor.clamp(images, min=min)
-        out_6 = tensor.clamp(images, max=max)
-
-        res1, res2, res3, res4, res5, res6 = exe.run(
-            fluid.default_main_program(),
-            feed={
-                "image": data,
-                "min": np.array([0.2]).astype('float32'),
-                "max": np.array([0.8]).astype('float32')
-            },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6])
-
-        self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
-        self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
-        self.assertTrue(np.allclose(res3, data.clip(min=0.3)))
-        self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
-        self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
-        self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
-
-
-class TestClampError(unittest.TestCase):
-    def test_errors(self):
-        x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16")
-        x2 = fluid.layers.data(name='x2', shape=[1], dtype="int8")
-        self.assertRaises(TypeError, tensor.clamp, x=x1, min=0.2, max=0.8)
-        self.assertRaises(TypeError, tensor.clamp, x=x2, min=0.2, max=0.8)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index 33bbd4c8830d689bed513b9ce4084c3d00a923a8..74c01e1424885051faf3e263e6ca26c1269a838e 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from op_test import OpTest
@@ -92,6 +93,13 @@ class TestCase4(TestClipOp):
         self.inputs['Min'] = np.array([0.3]).astype('float32')
 
 
+class TestCase5(TestClipOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max = 0.5
+        self.min = 0.5
+
+
 class TestClipOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
@@ -109,5 +117,69 @@ class TestClipOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestClipAPI(unittest.TestCase):
+    def test_clip(self):
+        paddle.enable_static()
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = fluid.data(name='image', shape=data_shape, dtype='float32')
+        min = fluid.data(name='min', shape=[1], dtype='float32')
+        max = fluid.data(name='max', shape=[1], dtype='float32')
+
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        out_1 = paddle.clip(images, min=min, max=max)
+        out_2 = paddle.clip(images, min=0.2, max=0.9)
+        out_3 = paddle.clip(images, min=0.3)
+        out_4 = paddle.clip(images, max=0.7)
+        out_5 = paddle.clip(images, min=min)
+        out_6 = paddle.clip(images, max=max)
+        out_7 = paddle.clip(images, max=-1.)
+        out_8 = paddle.clip(images)
+
+        res1, res2, res3, res4, res5, res6, res7, res8 = exe.run(
+            fluid.default_main_program(),
+            feed={
+                "image": data,
+                "min": np.array([0.2]).astype('float32'),
+                "max": np.array([0.8]).astype('float32')
+            },
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
+
+        self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
+        self.assertTrue(np.allclose(res3, data.clip(min=0.3)))
+        self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
+        self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
+        self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
+        self.assertTrue(np.allclose(res7, data.clip(max=-1)))
+        self.assertTrue(np.allclose(res8, data))
+
+    def test_clip_dygraph(self):
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        paddle.disable_static(place)
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = paddle.to_variable(data, dtype='float32')
+
+        out_1 = paddle.clip(images, min=0.2, max=0.8)
+        out_2 = paddle.clip(images, min=0.2, max=0.9)
+
+        self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9)))
+
+    def test_errors(self):
+        paddle.enable_static()
+        x1 = fluid.data(name='x1', shape=[1], dtype="int16")
+        x2 = fluid.data(name='x2', shape=[1], dtype="int8")
+        self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8)
+        self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..71777df4651ea26c7cf5dfc7231018288c2670e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveAllgatherAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allgather_nccl(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "nccl")
+
+    def test_allgather_gloo(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "gloo", "3")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24dd7cacff6adc56eb059a7bec016a1d3e322825
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveAllreduceAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allreduce_nccl(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "nccl")
+
+    def test_allreduce_gloo(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "gloo", "2")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..437b8b7befae470ab438cabc40817996cda5c938
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import unittest
+import time
+import argparse
+import os
+import six
+import sys
+import subprocess
+import traceback
+import functools
+import pickle
+from contextlib import closing
+from six import string_types
+import paddle.fluid as fluid
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+
+
+class TestCollectiveAPIRunnerBase(object):
+    def get_model(self, train_prog, startup_prog, rank):
+        raise NotImplementedError(
+            "get model should be implemented by child class.")
+
+    def wait_server_ready(self, endpoints):
+        assert not isinstance(endpoints, string_types)
+        while True:
+            all_ok = True
+            not_ready_endpoints = []
+            for ep in endpoints:
+                ip_port = ep.split(":")
+                with closing(
+                        socket.socket(socket.AF_INET,
+                                      socket.SOCK_STREAM)) as sock:
+                    sock.settimeout(2)
+                    result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                    if result != 0:
+                        all_ok = False
+                        not_ready_endpoints.append(ep)
+            if not all_ok:
+                sys.stderr.write("server not ready, wait 3 sec to retry...\n")
+                sys.stderr.write("not ready endpoints:" + str(
+                    not_ready_endpoints) + "\n")
+                sys.stderr.flush()
+                time.sleep(3)
+            else:
+                break
+
+    def initCommunicator(self, program, rank, nranks, wait_port,
+                         current_endpoint, endpoints):
+        other_endpoints = endpoints[:]
+        other_endpoints.remove(current_endpoint)
+        if rank == 0 and wait_port:
+            self.wait_server_ready(other_endpoints)
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=nameGen.generate('nccl_id'),
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': rank,
+                'endpoint': current_endpoint,
+                'other_endpoints': other_endpoints
+            })
+
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': nranks,
+                'rank': rank,
+                'ring_id': self.global_ring_id
+            })
+
+    def run_trainer(self, args):
+        train_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        endpoints = args["endpoints"].split(",")
+        rank = args["trainerid"]
+        current_endpoint = args["currentendpoint"]
+        nranks = 2
+        result = self.get_model(train_prog, startup_prog, rank)
+        if args['backend'] == 'nccl':
+            self.initCommunicator(startup_prog, rank, nranks, True,
+                                  current_endpoint, endpoints)
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(
+                device_id)  #if args.use_gpu else fluid.CPUPlace()
+        else:
+            strategy = fluid.core.GlooParallelStrategy()
+            strategy.rank = rank
+            strategy.rank_num = nranks
+            strategy.prefix = ""
+            strategy.iface = "lo"
+            strategy.init_seconds = 999999
+            strategy.run_seconds = 999999
+            strategy.path = "/tmp/tmp%d" % args['path_id']
+            gloo = fluid.core.GlooParallelContext(strategy)
+            gloo.init()
+            place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        np.random.seed(os.getpid())
+        indata = np.random.random((10, 1000))
+        fetch_list = []
+        for elem in result:
+            fetch_list.append(elem.name)
+        out = exe.run(train_prog,
+                      feed={'tindata': indata},
+                      fetch_list=fetch_list)
+        if six.PY2:
+            print(pickle.dumps(out))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out))
+
+
+def runtime_main(test_class, col_type):
+    args = {}
+    model = test_class()
+    args["deviceid"] = os.getenv("FLAGS_selected_gpus")
+    args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID"))
+    args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM"))
+    args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS')
+    args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT")
+    args["col_type"] = col_type
+    args["backend"] = os.getenv("BACKEND")
+    args["path_id"] = int(os.getenv("PATH_ID"))
+    model.run_trainer(args)
+
+
+import paddle.compat as cpt
+import socket
+from contextlib import closing
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._port_set = set()
+        self._trainers = 2
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
+        self._python_interp = sys.executable
+
+    def _find_free_port(self):
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
+
+    def _run_cluster(self, model_file, envs):
+        worker_endpoints = self._ps_endpoints.split(",")
+        w0_ep, w1_ep = worker_endpoints
+        #print("w0_ep:",w0_ep," w1_ep:",w1_ep)
+        env0 = {
+            "FLAGS_selected_gpus": "0",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w0_ep
+        }
+
+        env1 = {
+            "FLAGS_selected_gpus": "1",
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w1_ep
+        }
+        #update environment
+        env0.update(envs)
+        env1.update(envs)
+        tr_cmd = "%s %s"
+        tr0_cmd = tr_cmd % (self._python_interp, model_file)
+        tr1_cmd = tr_cmd % (self._python_interp, model_file)
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
+        #print(tr0_cmd) 
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=env0)
+
+        tr1_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=env1)
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+        return pickle.loads(tr0_out), pickle.loads(
+            tr1_out), tr0_proc.pid, tr1_proc.pid
+
+    def check_with_place(self,
+                         model_file,
+                         col_type,
+                         backend="nccl",
+                         path_id="0",
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_eager_delete_tensor_gb": "0.0",
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+            "GLOG_v": "0",
+            "NCCL_P2P_DISABLE": "1",
+            "BACKEND": backend,
+            "PATH_ID": path_id
+        }
+        required_envs.update(need_envs)
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
+                                                         required_envs)
+        np.random.seed(pid0)
+        input1 = np.random.random((10, 1000))
+        np.random.seed(pid1)
+        input2 = np.random.random((10, 1000))
+        if col_type == "allgather":
+            need_result = np.vstack((input1, input2))
+            tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
+            tr_out1 = np.vstack((tr1_out[0], tr1_out[1]))
+            self.assertTrue(np.allclose(tr_out0, need_result))
+            self.assertTrue(np.allclose(tr_out1, need_result))
+        elif col_type == "broadcast":
+            need_result = input2
+            self.assertTrue(np.allclose(tr0_out, need_result))
+            self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "reduce":
+            need_result = input1 + input2
+            self.assertTrue(np.allclose(tr0_out, need_result))
+        elif col_type == "scatter":
+            need_result = input2
+            need_result1 = need_result[0:need_result.shape[0] // 2]
+            need_result2 = need_result[need_result.shape[0] // 2:]
+            self.assertTrue(np.allclose(tr0_out, need_result1))
+            self.assertTrue(np.allclose(tr1_out, need_result2))
+        elif col_type == "allreduce":
+            need_result = input1 + input2
+            self.assertTrue(
+                np.allclose(
+                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+            self.assertTrue(
+                np.allclose(
+                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+        else:
+            pass
diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebf86f6ae14f1ecbdb3711378c84a3c1ce4967fb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveBarrierAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_barrier_nccl(self):
+        self.check_with_place("collective_barrier_api.py", "barrier", "nccl")
+
+    def test_barrier_gloo(self):
+        self.check_with_place("collective_barrier_api.py", "barrier", "gloo",
+                              "5")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py
index 3f3a5642abc242e994844d0aac1b79cbf664e4d4..512b2967e02fd01e67f416c2fd9222ae8589d8d8 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_base.py
@@ -241,6 +241,15 @@ class TestDistBase(unittest.TestCase):
             need_result = input2
             self.assertTrue(np.allclose(tr0_out, need_result))
             self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "reduce":
+            need_result = input1 + input2
+            self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "scatter":
+            need_result = input2
+            need_result1 = need_result[0:need_result.shape[0] // 2]
+            need_result2 = need_result[need_result.shape[0] // 2:]
+            self.assertTrue(np.allclose(tr0_out, need_result1))
+            self.assertTrue(np.allclose(tr1_out, need_result2))
         elif col_type == "allreduce":
             need_result = input1 + input2
             self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1cf4f1ac4c822ad578f5ee0e0268324de5e5e25
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveBroadcastAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_broadcast_nccl(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "nccl")
+
+    def test_broadcast_gloo(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "gloo", "0")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..36837d6a227febd02e6ef1e2aeb905de19ca8acc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
@@ -0,0 +1,34 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_base import TestDistBase
+
+
+class TestCReduceOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_reduce(self):
+        self.check_with_place("collective_reduce_op.py", "reduce")
+
+    def test_reduce_calc_stream(self):
+        self.check_with_place("collective_reduce_op_calc_stream.py", "reduce")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3975f3fc1c6959ffbb28a51543ebfef00c52e5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
@@ -0,0 +1,34 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveReduceAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_reduce_nccl(self):
+        self.check_with_place("collective_reduce_api.py", "reduce", "nccl")
+
+    def test_reduce_gloo(self):
+        self.check_with_place("collective_reduce_api.py", "reduce", "gloo", "1")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe3ce73359559c0f9b4e0e3990032ce693aab8a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_base import TestDistBase
+
+
+class TestCScatterOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_scatter(self):
+        self.check_with_place("collective_scatter_op.py", "scatter")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cae842b396111f004b7ce52ce3f40c20ebe57263
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveScatterAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_scatter_gloo(self):
+        self.check_with_place("collective_scatter_api.py", "scatter", "gloo",
+                              "4")
+
+    def test_scatter_nccl(self):
+        self.check_with_place("collective_scatter_api.py", "scatter", "nccl")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index 46cae41f3045486837e33722b6c75f91859b65ba..30207340a27db0c1d00ab982cbac716e4b639c7e 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -25,33 +25,45 @@ import numpy
 import paddle
 import paddle.fluid as fluid
 
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 
 class TestCommunicatorGeoEnd2End(unittest.TestCase):
     def net(self):
         x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x1 = fluid.layers.data(name='x1', shape=[1], dtype='int64', lod_level=1)
+
+        emb = fluid.layers.embedding(
+            input=x1,
+            size=[10000, 10],
+            param_attr=fluid.ParamAttr(
+                name="embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+
+        pool = fluid.layers.sequence_pool(input=emb, pool_type="sum")
+        z = fluid.layers.concat(input=[x, pool], axis=1)
+        y_predict = fluid.layers.fc(input=z, size=1, act=None)
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
         avg_cost = fluid.layers.mean(cost)
-        return avg_cost, x, y
+        return avg_cost, x, x1, y
 
     def fake_reader(self):
         def reader():
             for i in range(10000):
                 x = numpy.random.random((1, 13)).astype('float32')
+                z = numpy.random.randint(0, 9999, (1, 1)).astype('int64')
                 y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
-                yield x, y
+                yield x, z, y
 
         return reader
 
     def run_pserver(self, role, strategy):
         fleet.init(role)
-        avg_cost, x, y = self.net()
+        avg_cost, x, z, y = self.net()
         optimizer = fluid.optimizer.SGD(0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
@@ -64,33 +76,41 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
         exe = fluid.Executor(place)
 
         fleet.init(role)
-        avg_cost, x, y = self.net()
+        avg_cost, x, z, y = self.net()
         optimizer = fluid.optimizer.SGD(0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
-        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])
 
         for batch_id, data in enumerate(train_reader()):
-            exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 
         fleet.stop_worker()
 
     def run_ut(self):
         training_role = os.getenv("TRAINING_ROLE", "TRAINER")
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER
-            if training_role == "TRAINER" else role_maker.Role.SERVER,
-            worker_num=1,
-            server_endpoints=["127.0.0.1:18099"])
+        os.environ["PADDLE_PSERVER_NUMS"] = "1"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001"
+
+        role = role_maker.PaddleCloudRoleMaker()
 
-        strategy = StrategyFactory.create_geo_strategy(10)
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
 
         if training_role == "TRAINER":
             self.run_trainer(role, strategy)
@@ -116,8 +136,7 @@ import paddle.fluid as fluid
 from paddle.fluid.communicator import Communicator
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet as fleet
 
 from test_communicator_geo import TestCommunicatorGeoEnd2End
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index be1f32fb0aee10fa08f6609eeda06579145aeb26..c0044d9d620796057cce0e3a51b2dec2878a0e17 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -16,16 +16,13 @@ from __future__ import print_function
 
 import unittest
 import time
-import threading
-import numpy
 
+import os
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.communicator import Communicator
 
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 
 class TestCommunicator(unittest.TestCase):
@@ -39,19 +36,24 @@ class TestCommunicator(unittest.TestCase):
         return avg_cost
 
     def test_communicator_sync(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
 
-        fleet.init(role)
+        fleet.init(role_maker.PaddleCloudRoleMaker())
         avg_cost = self.net()
 
         optimizer = fluid.optimizer.SGD(0.01)
 
-        strategy = StrategyFactory.create_sync_strategy()
-        strategy._program_config.wait_port = False
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index a97f54d6cac1ea91f05cb3dc68729f5b68df7c9e..cfad50409802d4f3d35c9da3b22597c681da91b1 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -93,11 +93,12 @@ def create_paddle_case(op_type, callback):
 
         def test_broadcast_api_1(self):
             with program_guard(Program(), Program()):
-                x = paddle.nn.data(name='x', shape=[1, 2, 1, 3], dtype='int32')
-                y = paddle.nn.data(name='y', shape=[1, 2, 3], dtype='int32')
+                x = paddle.static.data(
+                    name='x', shape=[1, 2, 1, 3], dtype='int32')
+                y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
-                exe = paddle.Executor(self.place)
+                exe = paddle.static.Executor(self.place)
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 48b597ab282351739fcca894aa69685a13a9688f..b4dbba7eead397c46c37a8df013dabb00177f030 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -268,9 +268,9 @@ class TestConcatAPI(unittest.TestCase):
         out_3 = paddle.concat(x=[x_2, x_3], axis=positive_1_int64)
         out_4 = paddle.concat(x=[x_2, x_3], axis=negative_int64)
 
-        exe = paddle.Executor(place=paddle.CPUPlace())
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
         [res_1, res_2, res_3, res_4] = exe.run(
-            paddle.default_main_program(),
+            paddle.static.default_main_program(),
             feed={"x_1": input_2,
                   "x_2": input_2,
                   "x_3": input_3},
@@ -284,14 +284,15 @@ class TestConcatAPI(unittest.TestCase):
         in1 = np.array([[1, 2, 3], [4, 5, 6]])
         in2 = np.array([[11, 12, 13], [14, 15, 16]])
         in3 = np.array([[21, 22], [23, 24]])
-        with paddle.imperative.guard():
-            x1 = paddle.imperative.to_variable(in1)
-            x2 = paddle.imperative.to_variable(in2)
-            x3 = paddle.imperative.to_variable(in3)
-            out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
-            out2 = paddle.concat(x=[x1, x2], axis=0)
-            np_out1 = np.concatenate([in1, in2, in3], axis=-1)
-            np_out2 = np.concatenate([in1, in2], axis=0)
+        paddle.disable_static()
+        x1 = paddle.to_variable(in1)
+        x2 = paddle.to_variable(in2)
+        x3 = paddle.to_variable(in3)
+        out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
+        out2 = paddle.concat(x=[x1, x2], axis=0)
+        np_out1 = np.concatenate([in1, in2, in3], axis=-1)
+        np_out2 = np.concatenate([in1, in2], axis=0)
+        paddle.enable_static()
         self.assertEqual((out1.numpy() == np_out1).all(), True)
         self.assertEqual((out2.numpy() == np_out2).all(), True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..da527b26bf0608da5a648d92b492ff27cf2802f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class Conv1dTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 batch_size=4,
+                 spartial_shape=(16, ),
+                 num_channels=6,
+                 num_filters=8,
+                 filter_size=3,
+                 padding=0,
+                 padding_mode="zeros",
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 no_bias=False,
+                 dtype="float32",
+                 data_format="NCL"):
+        super(Conv1dTestCase, self).__init__(methodName)
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.num_filters = num_filters
+        self.spartial_shape = spartial_shape
+        self.filter_size = filter_size
+        self.data_format = data_format
+        self.channel_last = (self.data_format == "NHWC")
+
+        self.padding = padding
+        self.padding_mode = padding_mode
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.no_bias = no_bias
+        self.dtype = dtype
+
+    def setUp(self):
+        input_shape = (self.batch_size, self.num_channels
+                       ) + self.spartial_shape if not self.channel_last else (
+                           self.batch_size, ) + self.spartial_shape + (
+                               self.num_channels, )
+        self.input = np.random.randn(*input_shape).astype(self.dtype)
+
+        if isinstance(self.filter_size, int):
+            filter_size = [self.filter_size]
+        else:
+            filter_size = self.filter_size
+        self.weight_shape = weight_shape = (self.num_filters, self.num_channels
+                                            // self.groups) + tuple(filter_size)
+        self.weight = np.random.uniform(
+            -1, 1, size=weight_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(
+                -1, 1, size=(self.num_filters, )).astype(self.dtype)
+        else:
+            self.bias = None
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                input_shape = (-1, self.num_channels,
+                               -1) if not self.channel_last else (
+                                   -1, -1, self.num_channels)
+                x_var = fluid.data("input", input_shape, dtype=self.dtype)
+                w_var = fluid.data(
+                    "weight", self.weight_shape, dtype=self.dtype)
+                b_var = fluid.data(
+                    "bias", (self.num_filters, ), dtype=self.dtype)
+                y_var = F.conv1d(
+                    x_var,
+                    w_var,
+                    b_var if not self.no_bias else None,
+                    padding=self.padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format)
+        feed_dict = {"input": self.input, "weight": self.weight}
+        if self.bias is not None:
+            feed_dict["bias"] = self.bias
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_nn_layer(self):
+        x_var = paddle.to_tensor(self.input)
+        conv = nn.Conv1d(
+            self.num_channels,
+            self.num_filters,
+            self.filter_size,
+            padding=self.padding,
+            padding_mode=self.padding_mode,
+            stride=self.stride,
+            dilation=self.dilation,
+            groups=self.groups,
+            data_format=self.data_format)
+        conv.weight.set_value(self.weight)
+        if not self.no_bias:
+            conv.bias.set_value(self.bias)
+        y_var = conv(x_var)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.functional(place)
+        with dg.guard(place):
+            result2 = self.paddle_nn_layer()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class Conv1dErrorTestCase(Conv1dTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_nn_layer()
+
+
+def add_cases(suite):
+    suite.addTest(Conv1dTestCase(methodName='runTest'))
+    suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2))
+    suite.addTest(Conv1dTestCase(methodName='runTest', stride=2, dilation=(1)))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', padding="same", no_bias=True))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', padding=2, data_format='NLC'))
+    suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1]))
+    suite.addTest(Conv1dTestCase(methodName='runTest', padding=2))
+    suite.addTest(Conv1dTestCase(methodName='runTest'))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', groups=2, padding="valid"))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest',
+            num_filters=6,
+            num_channels=3,
+            groups=3,
+            padding="valid",
+            data_format='NLC'))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding_mode="reflect", padding="valid"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', data_format="VALID"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding_mode="VALID"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', num_channels=5, groups=2))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', num_filters=8, num_channels=15, groups=3))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5]))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..73227dd3610376d85fcfc70bb2653dfd927427fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class ConvTranspose1dTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 batch_size=4,
+                 spartial_shape=16,
+                 in_channels=6,
+                 out_channels=8,
+                 filter_size=3,
+                 output_size=None,
+                 padding=0,
+                 output_padding=0,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 no_bias=False,
+                 data_format="NCL",
+                 dtype="float32"):
+        super(ConvTranspose1dTestCase, self).__init__(methodName)
+        self.batch_size = batch_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.spartial_shape = spartial_shape
+        self.filter_size = filter_size
+        self.output_size = output_size
+
+        self.padding = padding
+        self.output_padding = output_padding
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.no_bias = no_bias
+        self.data_format = data_format
+        self.dtype = dtype
+
+    def setUp(self):
+
+        self.channel_last = False if self.data_format == "NCL" else True
+        input_shape = (self.batch_size, self.in_channels,
+                       self.spartial_shape) if not self.channel_last else (
+                           self.batch_size,
+                           self.spartial_shape,
+                           self.in_channels, )
+        self.input = np.random.randn(*input_shape).astype(self.dtype)
+
+        if isinstance(self.filter_size, int):
+            filter_size = [self.filter_size]
+        else:
+            filter_size = self.filter_size
+        self.weight_shape = weight_shape = (self.in_channels, self.out_channels
+                                            // self.groups) + tuple(filter_size)
+        self.weight = np.random.uniform(
+            -1, 1, size=weight_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(
+                -1, 1, size=(self.out_channels, )).astype(self.dtype)
+        else:
+            self.bias = None
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                input_shape = (-1, self.in_channels,
+                               -1) if not self.channel_last else (
+                                   -1, -1, self.in_channels)
+                x_var = fluid.data("input", input_shape, dtype=self.dtype)
+                w_var = fluid.data(
+                    "weight", self.weight_shape, dtype=self.dtype)
+                b_var = fluid.data(
+                    "bias", (self.out_channels, ), dtype=self.dtype)
+                y_var = F.conv_transpose1d(
+                    x_var,
+                    w_var,
+                    None if self.no_bias else b_var,
+                    output_size=self.output_size,
+                    padding=self.padding,
+                    output_padding=self.output_padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format)
+        feed_dict = {"input": self.input, "weight": self.weight}
+        if self.bias is not None:
+            feed_dict["bias"] = self.bias
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_nn_layer(self):
+        x_var = paddle.to_tensor(self.input)
+        conv = nn.ConvTranspose1d(
+            self.in_channels,
+            self.out_channels,
+            self.filter_size,
+            padding=self.padding,
+            output_padding=self.output_padding,
+            stride=self.stride,
+            dilation=self.dilation,
+            groups=self.groups,
+            data_format=self.data_format)
+        conv.weight.set_value(self.weight)
+        if not self.no_bias:
+            conv.bias.set_value(self.bias)
+        y_var = conv(x_var, output_size=self.output_size)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.functional(place)
+        with dg.guard(place):
+            result2 = self.paddle_nn_layer()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class ConvTranspose1dErrorTestCase(ConvTranspose1dTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_nn_layer()
+
+
+def add_cases(suite):
+    suite.addTest(ConvTranspose1dTestCase(methodName='runTest'))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', stride=[2], no_bias=True, dilation=2))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            filter_size=(3),
+            output_size=[36],
+            stride=[2],
+            dilation=2))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', stride=2, dilation=(2)))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', padding='valid'))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', filter_size=1, padding=3))
+    suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[2]))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', data_format="NLC"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', groups=2, padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            out_channels=6,
+            in_channels=3,
+            groups=3,
+            padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            data_format="NLC",
+            spartial_shape=16,
+            output_size=18))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', data_format="NLC", stride=3,
+            output_padding=2))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', data_format="not_valid"))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', in_channels=5, groups=2))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', stride=2, output_padding=3))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', output_size="not_valid"))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
index 64653ce2e7b8630030094b4004ecb17d56d3ff43..6bfe2aca530ddea6b49f12ad34dd9672e2a99ab5 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
@@ -20,6 +20,10 @@ import paddle.fluid.initializer as I
 import unittest
 
 
+def _reverse_repeat_list(t, n):
+    return list(x for x in reversed(t) for _ in range(n))
+
+
 class Conv2DTestCase(unittest.TestCase):
     def __init__(self,
                  methodName='runTest',
@@ -29,12 +33,11 @@ class Conv2DTestCase(unittest.TestCase):
                  num_filters=8,
                  filter_size=3,
                  padding=0,
+                 padding_mode='zeros',
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCHW",
                  dtype="float32"):
         super(Conv2DTestCase, self).__init__(methodName)
@@ -45,12 +48,16 @@ class Conv2DTestCase(unittest.TestCase):
         self.filter_size = filter_size
 
         self.padding = padding
+        if padding_mode in {'reflect', 'replicate', 'circular'}:
+            _paired_padding = fluid.layers.utils.convert_to_list(padding, 2,
+                                                                 'padding')
+            self._reversed_padding_repeated_twice = _reverse_repeat_list(
+                _paired_padding, 2)
+        self.padding_mode = padding_mode
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -91,19 +98,27 @@ class Conv2DTestCase(unittest.TestCase):
                     bias_attr = False
                 else:
                     bias_attr = I.NumpyArrayInitializer(self.bias)
+                if self.padding_mode != 'zeros':
+                    x_var = F.pad(x_var,
+                                  self._reversed_padding_repeated_twice,
+                                  mode=self.padding_mode,
+                                  data_format=self.data_format)
+                    padding = 0
+                else:
+                    padding = self.padding
+
                 y_var = fluid.layers.conv2d(
                     x_var,
                     self.num_filters,
                     self.filter_size,
-                    padding=self.padding,
+                    padding=padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
+
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -122,16 +137,24 @@ class Conv2DTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
+
+                if self.padding_mode != 'zeros':
+                    x_var = F.pad(x_var,
+                                  self._reversed_padding_repeated_twice,
+                                  mode=self.padding_mode,
+                                  data_format=self.data_format)
+                    padding = 0
+                else:
+                    padding = self.padding
+
                 y_var = F.conv2d(
                     x_var,
                     w_var,
                     b_var if not self.no_bias else None,
-                    padding=self.padding,
+                    padding=padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -143,18 +166,16 @@ class Conv2DTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv2D(
+        conv = nn.Conv2d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
             padding=self.padding,
+            padding_mode=self.padding_mode,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -198,7 +219,7 @@ def add_cases(suite):
             methodName='runTest', stride=2, dilation=(2, 1)))
     suite.addTest(
         Conv2DTestCase(
-            methodName='runTest', padding="same", no_bias=True, act="sigmoid"))
+            methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
         Conv2DTestCase(
             methodName='runTest', filter_size=(3, 3), padding='valid'))
@@ -222,15 +243,28 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='reflect'))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='replicate'))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='circular'))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv2DErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv2DErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
index 989836d5993af5620a7b5fbd86c07b028e419fc4..ba450b345b8a309f5d7ff1e7a5c149809f55f46c 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
@@ -29,13 +29,12 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                  num_filters=8,
                  filter_size=3,
                  output_size=None,
+                 output_padding=0,
                  padding=0,
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCHW",
                  dtype="float32"):
         super(Conv2DTransposeTestCase, self).__init__(methodName)
@@ -45,14 +44,13 @@ class Conv2DTransposeTestCase(unittest.TestCase):
         self.spartial_shape = spartial_shape
         self.filter_size = filter_size
         self.output_size = output_size
+        self.output_padding = output_padding
 
         self.padding = padding
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -93,6 +91,7 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     bias_attr = False
                 else:
                     bias_attr = I.NumpyArrayInitializer(self.bias)
+
                 y_var = fluid.layers.conv2d_transpose(
                     x_var,
                     self.num_filters,
@@ -104,8 +103,6 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -125,17 +122,22 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv2d_transpose(
+
+                if self.output_padding != 0:
+                    output_size = None
+                else:
+                    output_size = self.output_size
+
+                y_var = F.conv_transpose2d(
                     x_var,
                     w_var,
                     None if self.no_bias else b_var,
-                    output_size=self.output_size,
+                    output_size=output_size,
                     padding=self.padding,
+                    output_padding=self.output_padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -147,32 +149,38 @@ class Conv2DTransposeTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv2DTranspose(
+
+        if self.output_padding != 0:
+            output_size = None
+        else:
+            output_size = self.output_size
+
+        conv = nn.ConvTranspose2d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
-            output_size=self.output_size,
             padding=self.padding,
+            output_padding=self.output_padding,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
-        y_var = conv(x_var)
+        y_var = conv(x_var, output_size)
         y_np = y_var.numpy()
         return y_np
 
     def _test_equivalence(self, place):
         place = fluid.CPUPlace()
+
         result1 = self.fluid_layer(place)
         result2 = self.functional(place)
+
         with dg.guard(place):
             result3 = self.paddle_nn_layer()
+
         np.testing.assert_array_almost_equal(result1, result2)
         np.testing.assert_array_almost_equal(result2, result3)
 
@@ -194,7 +202,7 @@ class Conv2DTransposeErrorTestCase(Conv2DTransposeTestCase):
 
 
 def add_cases(suite):
-    suite.addTest(Conv2DTransposeTestCase(methodName='runTest', act="relu"))
+    suite.addTest(Conv2DTransposeTestCase(methodName='runTest'))
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', stride=[1, 2], no_bias=True, dilation=2))
@@ -211,9 +219,6 @@ def add_cases(suite):
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', padding="valid"))
-    suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', padding='valid'))
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', filter_size=1, padding=(2, 3)))
@@ -240,15 +245,22 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
+    suite.addTest(
+        Conv2DTransposeTestCase(
+            methodName='runTest',
+            num_filters=6,
+            num_channels=3,
+            spartial_shape=(7, 7),
+            filter_size=[5, 5],
+            groups=1,
+            padding=2,
+            stride=2,
+            output_size=[14, 14],
+            output_padding=[1, 1], ))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv2DTransposeErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv2DTransposeErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index f4418150e8a69d795ff544073b6ba6dd7431e44b..913db51da500b6c324abfab61744dfc1947bf7a5 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -77,8 +77,13 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
         output_size = attrs['output_size']
         out_h = output_size[0] + pad_h_0 + pad_h_1
         out_w = output_size[1] + pad_w_0 + pad_w_1
-
-    out = np.zeros((in_n, out_c, out_h, out_w), dtype=input_.dtype)
+    out_pad_h = 0
+    out_pad_w = 0
+    if 'output_padding' in attrs:
+        out_pad_h = attrs['output_padding'][0]
+        out_pad_w = attrs['output_padding'][1]
+    out = np.zeros(
+        (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype)
 
     for n in range(in_n):
         for i in range(in_h):
@@ -99,7 +104,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
                         out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
                             dilations[1]] += tmp_out
 
-    out = out[:, :, pad_h_0:out_h - pad_h_1, pad_w_0:out_w - pad_w_1]
+    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, pad_w_0:out_w - pad_w_1
+              + out_pad_w]
     if attrs['data_format'] == 'NHWC':
         out = np.transpose(out, [0, 2, 3, 1])
     return out
@@ -114,6 +120,7 @@ class TestConv2dTransposeOp(OpTest):
         self.use_cudnn = False
         self.use_mkldnn = False
         self.output_size = None
+        self.output_padding = []
         self.data_format = "NCHW"
         self.pad = [0, 0]
         self.padding_algorithm = "EXPLICIT"
@@ -138,6 +145,9 @@ class TestConv2dTransposeOp(OpTest):
         if self.output_size is not None:
             self.attrs['output_size'] = self.output_size
 
+        if len(self.output_padding) > 0:
+            self.attrs['output_padding'] = self.output_padding
+
         output = conv2dtranspose_forward_naive(input_, filter_,
                                                self.attrs).astype(self.dtype)
 
@@ -290,6 +300,18 @@ class TestWithEvenUpsample(TestConv2dTransposeOp):
         self.filter_size = [f_c, 6, 5, 5]
 
 
+class TestWithEvenUpsampleOutputPadding(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [2, 2]
+        self.stride = [2, 2]
+        self.groups = 1
+        self.dilations = [1, 1]
+        self.output_padding = [1, 1]
+        self.input_size = [2, 3, 7, 7]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 5, 5]
+
+
 class Test_NHWC(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [0, 0]
@@ -375,6 +397,19 @@ class TestWithEvenUpsample_NHWC(TestConv2dTransposeOp):
         self.data_format = 'NHWC'
 
 
+class TestWithEvenUpsample_NHWC_output_padding(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [2, 2]
+        self.stride = [2, 2]
+        self.groups = 1
+        self.dilations = [1, 1]
+        self.output_padding = [1, 1]
+        self.input_size = [2, 7, 7, 3]  # NHWC
+        f_c = self.input_size[-1]
+        self.filter_size = [f_c, 6, 5, 5]
+        self.data_format = 'NHWC'
+
+
 # ------------ test_cudnn ------------
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
index cf582c6210b76c6546de6d09d9219dbf4005bb17..56355a1c95e0396d0dec53cae02c3a99bf874013 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
@@ -32,9 +32,7 @@ class Conv3DTestCase(unittest.TestCase):
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCDHW",
                  dtype="float32"):
         super(Conv3DTestCase, self).__init__(methodName)
@@ -48,9 +46,7 @@ class Conv3DTestCase(unittest.TestCase):
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -101,8 +97,6 @@ class Conv3DTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -130,8 +124,6 @@ class Conv3DTestCase(unittest.TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -143,7 +135,7 @@ class Conv3DTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3D(
+        conv = nn.Conv3d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
@@ -151,10 +143,7 @@ class Conv3DTestCase(unittest.TestCase):
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -225,15 +214,10 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv3DErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv3DErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
index acaf33467dbfc1c580ab3a36f08d0c2a26d7c239..e30f0cd3ecd0b872efa53c85e0666e4a6fb00a88 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
@@ -33,9 +33,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCDHW",
                  dtype="float32"):
         super(Conv3DTransposeTestCase, self).__init__(methodName)
@@ -50,9 +48,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -104,8 +100,6 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -125,7 +119,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv3d_transpose(
+                y_var = F.conv_transpose3d(
                     x_var,
                     w_var,
                     None if self.no_bias else b_var,
@@ -134,8 +128,6 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -147,23 +139,19 @@ class Conv3DTransposeTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3DTranspose(
+        conv = nn.ConvTranspose3d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
-            output_size=self.output_size,
             padding=self.padding,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
-        y_var = conv(x_var)
+        y_var = conv(x_var, self.output_size)
         y_np = y_var.numpy()
         return y_np
 
@@ -194,7 +182,7 @@ class Conv3DTransposeErrorTestCase(Conv3DTransposeTestCase):
 
 
 def add_cases(suite):
-    suite.addTest(Conv3DTransposeTestCase(methodName='runTest', act="tanh"))
+    suite.addTest(Conv3DTransposeTestCase(methodName='runTest'))
     suite.addTest(
         Conv3DTransposeTestCase(
             methodName='runTest', stride=[1, 2, 1], dilation=2, no_bias=True))
@@ -240,15 +228,10 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv3DTransposeErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv3DTransposeErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e25613fa63da440f71f23841095f153e61735e9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
@@ -0,0 +1,140 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+
+from paddle.fluid import Program, program_guard, Executor, default_main_program
+
+
+class TestCosineSimilarityAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def _get_numpy_out(self, x1, x2, axis=1, eps=1e-8):
+        w12 = np.sum(x1 * x2, axis=axis)
+        w1 = np.sum(x1 * x1, axis=axis)
+        w2 = np.sum(x2 * x2, axis=axis)
+        n12 = np.sqrt(np.clip(w1 * w2, eps * eps, None))
+        cos_sim = w12 / n12
+        return cos_sim
+
+    def check_static_result(self, place):
+        paddle.enable_static()
+
+        with program_guard(Program(), Program()):
+            shape = [10, 15]
+            axis = 1
+            eps = 1e-8
+            np.random.seed(0)
+            np_x1 = np.random.rand(*shape).astype(np.float32)
+            np_x2 = np.random.rand(*shape).astype(np.float32)
+
+            x1 = paddle.data(name="x1", shape=shape)
+            x2 = paddle.data(name="x2", shape=shape)
+            result = F.cosine_similarity(x1, x2, axis=axis, eps=eps)
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x1": np_x1,
+                                    "x2": np_x2},
+                              fetch_list=[result])
+
+            np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+            self.assertTrue(np.allclose(fetches[0], np_out))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph_1(self):
+        paddle.disable_static()
+
+        shape = [10, 15]
+        axis = 1
+        eps = 1e-8
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape).astype(np.float32)
+        np_x2 = np.random.rand(*shape).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        shape = [12, 13]
+        axis = 0
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape).astype(np.float32)
+        np_x2 = np.random.rand(*shape).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_3(self):
+        paddle.disable_static()
+
+        shape1 = [10, 12, 10]
+        shape2 = [10, 1, 10]
+        axis = 2
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape1).astype(np.float32)
+        np_x2 = np.random.rand(*shape2).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_4(self):
+        paddle.disable_static()
+
+        shape1 = [23, 12, 1]
+        shape2 = [23, 1, 10]
+        axis = 2
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape1).astype(np.float32)
+        np_x2 = np.random.rand(*shape2).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = cos_sim_func(tesnor_x1, tesnor_x2)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 7f667d6b71c7f52f6d5afb42045c2da0cc45587b..4982cd195820811b9a8ec3fe6d01955234032120 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -535,5 +535,443 @@ class CrossEntropyLoss(unittest.TestCase):
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
 
+class FuncCrossEntropyLoss(unittest.TestCase):
+    #1
+    def test_cross_entropy_loss_1d_with_weight_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np)[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #2
+    def test_cross_entropy_loss_1d_with_weight_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #3
+    def test_cross_entropy_loss_1d_with_weight_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #4
+    def test_cross_entropy_loss_1d_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np)[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #5
+    def test_cross_entropy_loss_1d_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #6
+    def test_cross_entropy_loss_1d_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #7
+    def test_cross_entropy_loss_2d_with_weight_none(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #8
+    def test_cross_entropy_loss_2d_with_weight_mean(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='mean')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='mean')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='mean')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #9
+    def test_cross_entropy_loss_2d_with_weight_sum(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #10
+    def test_cross_entropy_loss_2d_none(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #11
+    def test_cross_entropy_loss_2d_mean(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='mean')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='mean')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, reduction='mean')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #12
+    def test_cross_entropy_loss_2d_sum(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(input_np, label_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
index a1a80bfdb549fe509171d4ed3d320547aa5aec51..ad121fac8cc045e67cf116d2cf9cedd6ac9bef99 100644
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -17,9 +17,92 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+from paddle import to_variable
+
+
+class TestCumsumOp(unittest.TestCase):
+    def run_cases(self):
+        data_np = np.arange(12).reshape(3, 4)
+        data = to_variable(data_np)
+
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, dtype='float64')
+        self.assertTrue(y.dtype == core.VarDesc.VarType.FP64)
+
+        y = paddle.cumsum(data, dtype=np.int32)
+        self.assertTrue(y.dtype == core.VarDesc.VarType.INT32)
+
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+    def run_static(self, use_gpu=False):
+        with fluid.program_guard(fluid.Program()):
+            data_np = np.random.random((100, 100)).astype(np.float32)
+            x = paddle.static.data('X', [100, 100])
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, dtype='float64')
+            y5 = paddle.cumsum(x, dtype=np.int32)
+            y6 = paddle.cumsum(x, axis=-2)
+
+            place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            out = exe.run(feed={'X': data_np},
+                          fetch_list=[
+                              y.name, y2.name, y3.name, y4.name, y5.name,
+                              y6.name
+                          ])
+
+            z = np.cumsum(data_np)
+            self.assertTrue(np.allclose(z, out[0]))
+            z = np.cumsum(data_np, axis=0)
+            self.assertTrue(np.allclose(z, out[1]))
+            z = np.cumsum(data_np, axis=-1)
+            self.assertTrue(np.allclose(z, out[2]))
+            self.assertTrue(out[3].dtype == np.float64)
+            self.assertTrue(out[4].dtype == np.int32)
+            z = np.cumsum(data_np, axis=-2)
+            self.assertTrue(np.allclose(z, out[5]))
+
+    def test_cpu(self):
+        paddle.disable_static(paddle.fluid.CPUPlace())
+        self.run_cases()
+        paddle.enable_static()
+
+        self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        paddle.disable_static(paddle.fluid.CUDAPlace(0))
+        self.run_cases()
+        paddle.enable_static()
+
+        self.run_static(use_gpu=True)
+
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = paddle.static.data('x', [3, 4])
+            y = paddle.cumsum(x, name='out')
+            self.assertTrue('out' in y.name)
 
 
 class TestSumOp1(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py
index 22dc72048e429ed257e9d7d1213b6cb7dcafbf1a..8070148f8b36dd7dab7711abaf25994acebc7e6f 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -16,9 +16,11 @@ from __future__ import print_function
 
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import Program, program_guard
+import paddle.fluid.core as core
 
 
 class TestApiDataError(unittest.TestCase):
@@ -53,5 +55,49 @@ class TestApiDataError(unittest.TestCase):
             self.assertRaises(TypeError, test_shape_type)
 
 
+class TestApiStaticDataError(unittest.TestCase):
+    def test_fluid_dtype(self):
+        with program_guard(Program(), Program()):
+            x1 = paddle.static.data(name="x1", shape=[2, 25])
+            self.assertEqual(x1.dtype, core.VarDesc.VarType.FP32)
+
+            x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="bool")
+            self.assertEqual(x2.dtype, core.VarDesc.VarType.BOOL)
+
+            paddle.set_default_dtype("float64")
+            x3 = paddle.static.data(name="x3", shape=[2, 25])
+            self.assertEqual(x3.dtype, core.VarDesc.VarType.FP64)
+
+    def test_fluid_data(self):
+        with program_guard(Program(), Program()):
+
+            # 1. The type of 'name' in fluid.data must be str.
+            def test_name_type():
+                paddle.static.data(name=1, shape=[2, 25], dtype="bool")
+
+            self.assertRaises(TypeError, test_name_type)
+
+            # 2. The type of 'shape' in fluid.data must be list or tuple.
+            def test_shape_type():
+                paddle.static.data(name='data1', shape=2, dtype="bool")
+
+            self.assertRaises(TypeError, test_shape_type)
+
+    def test_layers_data(self):
+        with program_guard(Program(), Program()):
+
+            # 1. The type of 'name' in layers.data must be str.
+            def test_name_type():
+                paddle.static.data(name=1, shape=[2, 25], dtype="bool")
+
+            self.assertRaises(TypeError, test_name_type)
+
+            # 2. The type of 'shape' in layers.data must be list or tuple.
+            def test_shape_type():
+                paddle.static.data(name='data1', shape=2, dtype="bool")
+
+            self.assertRaises(TypeError, test_shape_type)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
index c766cf17f422205521641ae44ab2060b4ab6e81c..cefef9ff9183e34d1ae7ae3e9a2f88969bf094a6 100644
--- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
@@ -271,7 +271,7 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest):
         self.use_mkldnn = False
         epsilon = 0.00001
         slot_dim = -1
-        enable_scale_and_shitf = True
+        enable_scale_and_shift = True
         x_shape = [2, 50]
         scale_shape = [50]
         tp = np.float32
@@ -319,6 +319,63 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest):
         self.check_grad(['X'], 'Y', no_grad_set=set([]))
 
 
+class TestDataNormOpWithoutEnableScaleAndShift(OpTest):
+    """
+    test class for data norm op
+    test forward and backward
+    """
+
+    def setUp(self):
+        """
+        init data norm op test env
+        """
+        self.op_type = 'data_norm'
+        self.use_mkldnn = False
+        epsilon = 0.00001
+        slot_dim = -1
+        enable_scale_and_shift = True
+        x_shape = [2, 50]
+        scale_shape = [50]
+        tp = np.float32
+
+        x_val = np.random.uniform(-1, 1, x_shape).astype(tp)
+        batch_size = np.ones(scale_shape).astype(tp)
+        batch_size *= 1e4
+        batch_sum = np.zeros(scale_shape).astype(tp)
+        batch_square_sum = np.ones(scale_shape).astype(tp)
+        batch_square_sum *= 1e4
+        scale_w = np.ones(scale_shape).astype(tp)
+        bias = np.zeros(scale_shape).astype(tp)
+
+        y = np.array(x_val)
+
+        mean = np.zeros(x_shape).astype(tp)
+        scale = np.ones(x_shape).astype(tp)
+
+        self.inputs = {
+            "X": x_val,
+            "BatchSize": batch_size,
+            "BatchSum": batch_sum,
+            "BatchSquareSum": batch_square_sum,
+            "scale_w": scale_w,
+            "bias": bias
+        }
+        self.outputs = {"Y": y, "Means": mean, "Scales": scale}
+        self.attrs = {"epsilon": epsilon, "use_mkldnn": self.use_mkldnn}
+
+    def test_check_output(self):
+        """
+        test check forward, check output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        test check backward, check grad
+        """
+        self.check_grad(['X'], 'Y', no_grad_set=set([]))
+
+
 class TestDataNormOpWithEnableScaleAndShift_1(OpTest):
     """
     test class for data norm op
@@ -333,7 +390,7 @@ class TestDataNormOpWithEnableScaleAndShift_1(OpTest):
         self.use_mkldnn = False
         epsilon = 0.00001
         slot_dim = 1
-        enable_scale_and_shitf = True
+        enable_scale_and_shift = True
         x_shape = [2, 50]
         scale_shape = [50]
         tp = np.float32
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 90d5f58539500b87e999bec4c475ec40d0f55483..582bb3dcc681921cdbf2111dcd26b299f06a3058 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -38,25 +38,25 @@ class TestDataset(unittest.TestCase):
     def test_dataset_create(self):
         """ Testcase for dataset create. """
         try:
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "InMemoryDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "QueueDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "FileInstantDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "MyOwnDataset")
             self.assertTrue(False)
         except:
@@ -95,7 +95,7 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
@@ -176,7 +176,7 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
@@ -228,7 +228,7 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
@@ -300,7 +300,7 @@ class TestDataset(unittest.TestCase):
                     name=slot, shape=[1], dtype="float32", lod_level=1)
                 slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(1)
@@ -367,7 +367,7 @@ class TestDataset(unittest.TestCase):
                 name="slot4", shape=[1], dtype="float32", lod_level=0)
             slots_vars = [var1, var2, var3, var4]
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(1)
@@ -423,7 +423,7 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
@@ -517,7 +517,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(
@@ -542,7 +543,8 @@ class TestDataset(unittest.TestCase):
                 except Exception as e:
                     self.assertTrue(False)
 
-        dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
+        dataset2 = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset2.set_use_var(slots_vars)
         dataset2.set_batch_size(32)
         dataset2.set_thread(3)
@@ -583,7 +585,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(
@@ -638,7 +641,7 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[None, 1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_input_type(1)
         dataset.set_batch_size(1)
@@ -718,7 +721,8 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
             inputs(list): inputs of get_dataset
             files(list): files of  get_dataset
         """
-        dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(files)
@@ -875,7 +879,7 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "InMemoryDataset")
             dataset.set_batch_size(32)
             dataset.set_thread(3)
@@ -945,7 +949,7 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "InMemoryDataset")
             dataset.set_batch_size(32)
             dataset.set_thread(3)
@@ -962,12 +966,12 @@ class TestDataset2(unittest.TestCase):
                 print("warning: catch expected error")
             fleet._opt_info = None
             fleet._fleet_ptr = None
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "InMemoryDataset")
             dataset.set_rank_offset("")
             dataset.set_pv_batch_size(1)
             dataset.set_hdfs_config("", "")
-            d = paddle.fleet.DatasetBase()
+            d = paddle.distributed.fleet.DatasetBase()
             try:
                 dataset.set_feed_type("MultiSlotInMemoryDataFeed")
             except:
@@ -1000,7 +1004,7 @@ class TestDataset2(unittest.TestCase):
             dataset.get_pv_data_size()
             dataset.get_memory_data_size()
             dataset.get_shuffle_data_size()
-            dataset = paddle.fleet.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "QueueDataset")
             try:
                 dataset.local_shuffle()
@@ -1010,7 +1014,7 @@ class TestDataset2(unittest.TestCase):
                 dataset.global_shuffle()
             except:
                 print("warning: catch expected error")
-            dataset = paddle.fleet.FileInstantDataset()
+            dataset = paddle.distributed.fleet.FileInstantDataset()
             try:
                 dataset.local_shuffle()
             except:
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
index 22d59e78fff867279880ddc283e096f4848512d0..c13c33f209f0f7d0fff95bdfb5b4e551a145b87e 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
@@ -97,7 +97,7 @@ class DatasetLoaderTestBase(unittest.TestCase):
 
     def check_batch_number(self, place, randomize_batch_num=False):
         main_prog, startup_prog, feeds = self.build_network()
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             self.dataset_name)
         dataset.set_batch_size(BATCH_SIZE)
 
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index f8cb6170be945ed628440b5a068f1acd0ac26503..a16f21c0f97c0902dd6c26561ed3f707b28ff947 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -122,14 +122,8 @@ class TestBase(unittest.TestCase):
                             label = item['label']
                             assert image.shape() == [BATCH_SIZE, 784]
                             assert label.shape() == [BATCH_SIZE, 1]
-                            if ps[i]._equals(fluid.CPUPlace()):
-                                assert image._place()._equals(fluid.CPUPlace())
-                                assert label._place()._equals(fluid.CPUPlace())
-                            else:
-                                assert image._place()._equals(
-                                    fluid.CUDAPinnedPlace())
-                                assert label._place()._equals(
-                                    fluid.CUDAPinnedPlace())
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
                         L, = exe.run(program=prog,
                                      feed=d,
                                      fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..057933fc7a735c2732cd651e83e99ddfa747b8a8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -0,0 +1,61 @@
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from paddle.framework import set_default_dtype, get_default_dtype
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear
+import paddle.fluid.core as core
+from paddle import to_variable
+
+
+class TestDefaultType(unittest.TestCase):
+    def check_default(self):
+        self.assertEqual("float32", get_default_dtype())
+
+    def test_api(self):
+        self.check_default()
+
+        set_default_dtype("float64")
+        self.assertEqual("float64", get_default_dtype())
+
+        set_default_dtype("float32")
+        self.assertEqual("float32", get_default_dtype())
+
+        set_default_dtype("float16")
+        self.assertEqual("float16", get_default_dtype())
+
+        set_default_dtype(np.float64)
+        self.assertEqual("float64", get_default_dtype())
+
+        set_default_dtype(np.float32)
+        self.assertEqual("float32", get_default_dtype())
+
+        set_default_dtype(np.float16)
+        self.assertEqual("float16", get_default_dtype())
+
+
+class TestRaiseError(unittest.TestCase):
+    def test_error(self):
+        self.assertRaises(TypeError, set_default_dtype, "int32")
+        self.assertRaises(TypeError, set_default_dtype, np.int32)
+        self.assertRaises(TypeError, set_default_dtype, "int64")
+        self.assertRaises(TypeError, set_default_dtype, np.int64)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ab56f9244f93266b90f3316bc2c2be5623e0ee7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_device.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import warnings
+import paddle
+
+
+class TestStaticDeviceManage(unittest.TestCase):
+    def test_cpu_device(self):
+        paddle.set_device('cpu')
+        out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+        out2 = paddle.ones(shape=[1, 3], dtype='float32')
+        out3 = paddle.concat(x=[out1, out2], axis=0)
+        exe = paddle.fluid.Executor()
+        exe.run(paddle.fluid.default_startup_program())
+        res = exe.run(fetch_list=[out3])
+        device = paddle.get_device()
+        self.assertEqual(isinstance(exe.place, core.CPUPlace), True)
+        self.assertEqual(device, "cpu")
+
+    def test_gpu_device(self):
+        if core.is_compiled_with_cuda():
+            out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+            out2 = paddle.ones(shape=[1, 3], dtype='float32')
+            out3 = paddle.concat(x=[out1, out2], axis=0)
+            paddle.set_device('gpu:0')
+            exe = paddle.fluid.Executor()
+            exe.run(paddle.fluid.default_startup_program())
+            res = exe.run(fetch_list=[out3])
+            device = paddle.get_device()
+            self.assertEqual(isinstance(exe.place, core.CUDAPlace), True)
+            self.assertEqual(device, "gpu:0")
+
+
+class TestImperativeDeviceManage(unittest.TestCase):
+    def test_cpu(self):
+        with fluid.dygraph.guard():
+            paddle.set_device('cpu')
+            out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+            out2 = paddle.ones(shape=[1, 3], dtype='float32')
+            out3 = paddle.concat(x=[out1, out2], axis=0)
+            device = paddle.get_device()
+            self.assertEqual(
+                isinstance(framework._current_expected_place(), core.CPUPlace),
+                True)
+            self.assertEqual(device, "cpu")
+
+    def test_gpu(self):
+        if core.is_compiled_with_cuda():
+            with fluid.dygraph.guard():
+                paddle.set_device('gpu:0')
+                out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+                out2 = paddle.ones(shape=[1, 3], dtype='float32')
+                out3 = paddle.concat(x=[out1, out2], axis=0)
+                device = paddle.get_device()
+                self.assertEqual(
+                    isinstance(framework._current_expected_place(),
+                               core.CUDAPlace), True)
+                self.assertEqual(device, "gpu:0")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index b6566676d2533aad5272fe61dbedbc1d55ea213b..780d57b53310bb5f385a131d4ad52dd6f5e695f0 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -17,11 +17,181 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid import Program, program_guard
 
 
+class TestDiagV2Op(OpTest):
+    def setUp(self):
+        self.op_type = "diag_v2"
+        self.x = np.random.rand(10, 10)
+        self.offset = 0
+        self.padding_value = 0.0
+        self.out = np.diag(self.x, self.offset)
+
+        self.init_config()
+        self.inputs = {'X': self.x}
+        self.attrs = {
+            'offset': self.offset,
+            'padding_value': self.padding_value
+        }
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_config(self):
+        pass
+
+
+class TestDiagV2OpCase1(TestDiagV2Op):
+    def init_config(self):
+        self.offset = 1
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase2(TestDiagV2Op):
+    def init_config(self):
+        self.offset = -1
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase3(TestDiagV2Op):
+    def init_config(self):
+        self.x = np.random.randint(-10, 10, size=(10, 10))
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase4(TestDiagV2Op):
+    def init_config(self):
+        self.x = np.random.rand(100)
+        self.padding_value = 8
+        n = self.x.size
+        self.out = self.padding_value * np.ones((n, n)) + np.diag(
+            self.x, self.offset) - np.diag(self.padding_value * np.ones(n))
+
+
+class TestDiagV2Error(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_diag_v2_type():
+                x = [1, 2, 3]
+                output = paddle.diag(x)
+
+            self.assertRaises(TypeError, test_diag_v2_type)
+
+            x = paddle.static.data('data', [3, 3])
+            self.assertRaises(TypeError, paddle.diag, x, offset=2.5)
+
+            self.assertRaises(TypeError, paddle.diag, x, padding_value=[9])
+
+            x = paddle.static.data('data2', [3, 3, 3])
+            self.assertRaises(ValueError, paddle.diag, x)
+
+
+class TestDiagV2API(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
+        self.expected0 = np.diag(self.input_np)
+        self.expected1 = np.diag(self.input_np, k=1)
+        self.expected2 = np.diag(self.input_np, k=-1)
+
+        self.input_np2 = np.random.rand(100)
+        self.offset = 0
+        self.padding_value = 8
+        n = self.input_np2.size
+        self.expected3 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np2, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+        self.input_np3 = np.random.randint(-10, 10, size=(100)).astype(np.int64)
+        self.padding_value = 8.0
+        n = self.input_np3.size
+        self.expected4 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+        self.padding_value = -8
+        self.expected5 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+    def run_imperative(self):
+        x = paddle.to_tensor(self.input_np)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected0))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected1))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected2))
+
+        x = paddle.to_tensor(self.input_np2)
+        y = paddle.diag(x, padding_value=8)
+        self.assertTrue(np.allclose(y.numpy(), self.expected3))
+
+        x = paddle.to_tensor(self.input_np3)
+        y = paddle.diag(x, padding_value=8.0)
+        self.assertTrue(np.allclose(y.numpy(), self.expected4))
+
+        y = paddle.diag(x, padding_value=-8)
+        self.assertTrue(np.allclose(y.numpy(), self.expected5))
+
+    def run_static(self, use_gpu=False):
+        x = paddle.data(name='input', shape=[10, 10], dtype='float32')
+        x2 = paddle.data(name='input2', shape=[100], dtype='float64')
+        x3 = paddle.data(name='input3', shape=[100], dtype='int64')
+        result0 = paddle.diag(x)
+        result1 = paddle.diag(x, offset=1)
+        result2 = paddle.diag(x, offset=-1)
+        result3 = paddle.diag(x, name='aaa')
+        result4 = paddle.diag(x2, padding_value=8)
+        result5 = paddle.diag(x3, padding_value=8.0)
+        result6 = paddle.diag(x3, padding_value=-8)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        res0, res1, res2, res4, res5, res6 = exe.run(
+            feed={
+                "input": self.input_np,
+                "input2": self.input_np2,
+                'input3': self.input_np3
+            },
+            fetch_list=[result0, result1, result2, result4, result5, result6])
+
+        self.assertTrue(np.allclose(res0, self.expected0))
+        self.assertTrue(np.allclose(res1, self.expected1))
+        self.assertTrue(np.allclose(res2, self.expected2))
+        self.assertTrue('aaa' in result3.name)
+        self.assertTrue(np.allclose(res4, self.expected3))
+        self.assertTrue(np.allclose(res5, self.expected4))
+        self.assertTrue(np.allclose(res6, self.expected5))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+
 class TestDiagOp(OpTest):
     def setUp(self):
         self.op_type = "diag"
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
new file mode 100644
index 0000000000000000000000000000000000000000..74cc87bd9dbd691c6a1683ac44cba246e67c4af2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -0,0 +1,179 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import subprocess
+import unittest
+import numpy as np
+import paddle
+
+
+class TestDirectory(unittest.TestCase):
+    def get_import_command(self, module):
+        paths = module.split('.')
+        if len(paths) == 1:
+            return 'import {}'.format(module)
+        package = '.'.join(paths[:-1])
+        func = paths[-1]
+        cmd = 'from {} import {}'.format(package, func)
+        return cmd
+
+    def test_new_directory(self):
+        new_directory = [
+            'paddle.enable_static', 'paddle.disable_static',
+            'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
+            'paddle.no_grad', 'paddle.save', 'paddle.load',
+            'paddle.static.save', 'paddle.static.load', 'paddle.ParallelEnv',
+            'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit',
+            'paddle.jit.TracedLayer', 'paddle.jit.to_static',
+            'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
+            'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig',
+            'paddle.NoamDecay', 'paddle.PiecewiseDecay',
+            'paddle.NaturalExpDecay', 'paddle.ExponentialDecay',
+            'paddle.InverseTimeDecay', 'paddle.PolynomialDecay',
+            'paddle.CosineDecay', 'paddle.static.Executor',
+            'paddle.static.global_scope', 'paddle.static.scope_guard',
+            'paddle.static.append_backward', 'paddle.static.gradients',
+            'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
+            'paddle.static.ExecutionStrategy',
+            'paddle.static.default_main_program',
+            'paddle.static.default_startup_program', 'paddle.static.Program',
+            'paddle.static.name_scope', 'paddle.static.program_guard',
+            'paddle.static.Print', 'paddle.static.py_func',
+            'paddle.static.ParallelExecutor',
+            'paddle.static.WeightNormParamAttr', 'paddle.static.nn.fc',
+            'paddle.static.nn.batch_norm',
+            'paddle.static.nn.bilinear_tensor_product',
+            'paddle.static.nn.conv2d', 'paddle.static.nn.conv2d_transpose',
+            'paddle.static.nn.conv3d', 'paddle.static.nn.conv3d_transpose',
+            'paddle.static.nn.create_parameter',
+            'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm',
+            'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm',
+            'paddle.static.nn.hsigmoid', 'paddle.static.nn.instance_norm',
+            'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head',
+            'paddle.static.nn.nce', 'paddle.static.nn.prelu',
+            'paddle.static.nn.row_conv', 'paddle.static.nn.spectral_norm',
+            'paddle.static.nn.embedding'
+        ]
+
+        import_file = 'run_import_modules.py'
+
+        with open(import_file, "w") as wb:
+            for module in new_directory:
+                run_cmd = self.get_import_command(module)
+                wb.write("{}\n".format(run_cmd))
+
+        _python = sys.executable
+
+        ps_cmd = "{} {}".format(_python, import_file)
+        ps_proc = subprocess.Popen(
+            ps_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        stdout, stderr = ps_proc.communicate()
+
+        assert "Error" not in str(stderr), "Error: Can't" \
+            " import Module {}".format(module)
+
+    def test_old_directory(self):
+        old_directory = [
+            'paddle.enable_imperative', 'paddle.disable_imperative',
+            'paddle.in_imperative_mode', 'paddle.imperative.to_variable',
+            'paddle.imperative.enable', 'paddle.imperative.guard',
+            'paddle.imperative.grad', 'paddle.imperative.no_grad',
+            'paddle.imperative.save', 'paddle.imperative.load',
+            'paddle.imperative.ParallelEnv',
+            'paddle.imperative.prepare_context',
+            'paddle.imperative.DataParalell', 'paddle.imperative.jit',
+            'paddle.imperative.TracedLayer', 'paddle.imperative.declarative',
+            'paddle.imperative.ProgramTranslator',
+            'paddle.imperative.TranslatedLayer', 'paddle.imperative.jit.save',
+            'paddle.imperative.jit.load',
+            'paddle.imperative.jit.SaveLoadConfig',
+            'paddle.imperative.NoamDecay'
+            'paddle.imperative.PiecewiseDecay',
+            'paddle.imperative.NaturalExpDecay',
+            'paddle.imperative.ExponentialDecay',
+            'paddle.imperative.InverseTimeDecay',
+            'paddle.imperative.PolynomialDecay',
+            'paddle.imperative.CosineDecay', 'paddle.Executor',
+            'paddle.global_scope', 'paddle.scope_guard',
+            'paddle.append_backward', 'paddle.gradients',
+            'paddle.BuildStrategy', 'paddle.CompiledProgram',
+            'paddle.ExecutionStrategy', 'paddle.name_scope',
+            'paddle.program_guard', 'paddle.Print', 'paddle.py_func',
+            'paddle.ParallelExecutor', 'paddle.default_main_program',
+            'paddle.default_startup_program', 'paddle.Program',
+            'paddle.WeightNormParamAttr', 'paddle.declarative.fc',
+            'paddle.declarative.batch_norm',
+            'paddle.declarative.bilinear_tensor_product',
+            'paddle.declarative.conv2d', 'paddle.declarative.conv2d_transpose',
+            'paddle.declarative.conv3d', 'paddle.declarative.conv3d_transpose',
+            'paddle.declarative.create_parameter',
+            'paddle.declarative.crf_decoding', 'paddle.declarative.data_norm',
+            'paddle.declarative.deformable_conv',
+            'paddle.declarative.group_norm', 'paddle.declarative.hsigmoid',
+            'paddle.declarative.instance_norm', 'paddle.declarative.layer_norm',
+            'paddle.declarative.multi_box_head', 'paddle.declarative.nce',
+            'paddle.declarative.prelu', 'paddle.declarative.row_conv',
+            'paddle.declarative.spectral_norm', 'paddle.declarative.embedding'
+        ]
+
+        import_file = 'run_old_import_modules.py'
+
+        with open(import_file, "w") as wb:
+            cmd_context_count = """
+count = 0
+err_module = ""
+"""
+            wb.write(cmd_context_count)
+            for module in old_directory:
+                run_cmd = self.get_import_command(module)
+                cmd_context_loop_template = """
+try:
+    {run_cmd}
+except:
+    count += 1
+else:
+    err_module = "{module}"
+"""
+                cmd_context_loop = cmd_context_loop_template.format(
+                    run_cmd=run_cmd, module=module)
+                wb.write(cmd_context_loop)
+            cmd_context_print_template = """
+if count != {len_old_directory}:
+    print("Error: Module " + err_module + " should not be imported")
+"""
+            cmd_context_print = cmd_context_print_template.format(
+                len_old_directory=str(len(old_directory)))
+            wb.write(cmd_context_print)
+
+        _python = sys.executable
+
+        ps_cmd = "{} {}".format(_python, import_file)
+        ps_proc = subprocess.Popen(
+            ps_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        stdout, stderr = ps_proc.communicate()
+
+        assert "Error" not in str(stdout), str(stdout)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df55a6b873e28a6e479fd05b31074802eb19bb7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import unittest
+
+import paddle
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer_trainer(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")
+
+        sends = 0
+        sgds = 0
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 7)
+        self.assertEqual(sgds, 0)
+
+        fleet.init_worker()
+        time.sleep(8)
+        fleet.stop_worker()
+
+    def test_a_sync_optimizer_pserver(self):
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
+        fleet.init_server()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ca41a11e325cfb66a3a3eaadb4eca6f9764212
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer_trainer(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[-1].type, "send")
+
+        sends = 0
+        sgds = 0
+
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 1)
+        self.assertEqual(sgds, 6)
+
+    def test_a_sync_optimizer_pserver(self):
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0993e022e1b9570773634ec829b088c5ff145ea
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "6007"
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_gradient_merge_optimizer(self):
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")
+
+        sends = 0
+        sgds = 0
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 6)
+        self.assertEqual(sgds, 0)
+
+        fleet.init_worker()
+        time.sleep(8)
+        fleet.stop_worker()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 8b2f7118ea766a0a2e5a7f74daa243b99f64129d..beb0069eb770f25d7834749ff9c188e5252e13c0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -31,10 +31,11 @@ import time
 import tempfile
 import unittest
 
+import paddle
 import paddle.fluid as fluid
-import paddle.fleet.base.role_maker as role_maker
-from paddle.fleet.base.util_factory import fleet_util
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.base.util_factory import fleet_util
+from paddle.distributed.fleet import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
 __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
@@ -56,7 +57,7 @@ class FleetDistRunnerBase(object):
         if args.role.upper() == "PSERVER":
             role = role_maker.UserDefinedRoleMaker(
                 is_collective=False,
-                init_gloo=True,
+                init_gloo=False,
                 path=args.gloo_path,
                 current_id=args.current_id,
                 role=role_maker.Role.SERVER,
@@ -65,7 +66,7 @@ class FleetDistRunnerBase(object):
         else:
             role = role_maker.UserDefinedRoleMaker(
                 is_collective=False,
-                init_gloo=True,
+                init_gloo=False,
                 path=args.gloo_path,
                 current_id=args.current_id,
                 role=role_maker.Role.WORKER,
@@ -75,21 +76,23 @@ class FleetDistRunnerBase(object):
         return role
 
     def build_strategy(self, args):
-        self.strategy = None
+        self.strategy = paddle.distributed.fleet.DistributedStrategy()
+        self.strategy.a_sync = False
         if args.mode == "async":
-            self.strategy = StrategyFactory.create_async_strategy()
-        elif args.mode == "sync":
-            self.strategy = StrategyFactory.create_sync_strategy()
-        elif args.mode == "half_async":
-            self.strategy = StrategyFactory.create_half_async_strategy()
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = True
         elif args.mode == "geo":
-            self.strategy = StrategyFactory.create_geo_strategy(
-                args.geo_sgd_need_push_nums)
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = True
+            self.strategy.a_sync_configs = {
+                "k_steps": args.geo_sgd_need_push_nums
+            }
         self.dump_param = os.getenv("dump_param", "").split(",")
         self.dump_fields = os.getenv("dump_fields", "").split(",")
         self.dump_fields_path = os.getenv("dump_fields_path", "")
         debug = int(os.getenv("Debug", "0"))
-        if debug:
+        # TODO(update strategy to support dump params)
+        if False:  #debug:
             self.strategy.set_debug_opt({
                 "dump_param": self.dump_param,
                 "dump_fields": self.dump_fields,
@@ -122,7 +125,7 @@ class FleetDistRunnerBase(object):
                     staircase=True))
         else:
             optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
     def run_pserver(self, args):
@@ -157,7 +160,13 @@ class TestFleetBase(unittest.TestCase):
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")
 
+    def tearDown(self):
+        t = time.time() - self.startTime
+        print('%s: %.3f' % (self.__class__.__name__, t))
+
     def setUp(self):
+        self.startTime = time.time()
+
         self._mode = "sync"
         self._reader = "pyreader"
         self._trainers = 2
@@ -278,6 +287,23 @@ class TestFleetBase(unittest.TestCase):
 
         tr0_ret = tr0.returncode
         tr1_ret = tr0.returncode
+        if tr0_ret != 0:
+            print(
+                "========================Error tr0_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
+            print(
+                "========================Error tr0_err end==========================="
+            )
+
+        if tr1_ret != 0:
+            print(
+                "========================Error tr1_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
+            print(
+                "========================Error tr1_err end==========================="
+            )
 
         self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
         self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index 18629c4f996a6d068339bd6cad494e8e8d21123f..b506f179143412e2bdb5d9eda511d90a0a3eea6d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase
 
 class TestDistMnistSync2x2(TestFleetBase):
     def _setup_config(self):
-        self._mode = "async"
+        self._mode = "sync"
         self._reader = "pyreader"
 
     def check_with_place(self,
@@ -123,7 +123,7 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase):
 
 class TestDistCtrHalfAsync2x2(TestFleetBase):
     def _setup_config(self):
-        self._mode = "half_async"
+        self._mode = "async"
         self._reader = "pyreader"
 
     def check_with_place(self,
@@ -156,5 +156,40 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
             "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
 
+class TestDistCtrPsGpuPyreaderAsync2x2(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "async"
+        self._reader = "pyreader"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "30000",  # 5sec to fail fast
+            "http_proxy": "",
+            "FLAGS_communicator_send_queue_size": "2",
+            "FLAGS_communicator_max_merge_var_num": "2",
+            "CPU_NUM": "2",
+            "SAVE_MODEL": "1"
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_ctr_ps_gpu.py", delta=1e-5, check_error_log=True)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bc0d8dadce44c8f711189466f34fb5cd76f39f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+import unittest
+import subprocess
+import time
+import paddle.fluid as fluid
+#import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+from test_dist_fleet_base import TestFleetBase
+
+#from dist_simnet_bow import train_network
+
+
+class TestDistGloo_2x2(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "sync"
+        self._reader = "pyreader"
+        self._path = "./tmp4"
+        if (os.path.exists(self._path)):
+            shutil.rmtree(self._path)
+        # if not os.path.exists(self._path):
+        #      os.mkdir(self._path)
+
+    def _start_pserver(self, cmd, required_envs):
+        #env.update(required_envs)
+        ps0_cmd = cmd
+        ps1_cmd = cmd
+
+        ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
+        ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")
+
+        required_envs["POD_IP"] = "127.0.0.1"
+        required_envs["PADDLE_PSERVER_ID"] = "0"
+        required_envs["PADDLE_PORT"] = "36011"
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps0_pipe,
+            env=required_envs)
+        print("PADDLE_PSERVER_ID=0:")
+        print(required_envs)
+        required_envs["PADDLE_PSERVER_ID"] = "1"
+        required_envs["PADDLE_PORT"] = "36012"
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps1_pipe,
+            env=required_envs)
+        print("PADDLE_PSERVER_ID=1:")
+        print(required_envs)
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
+
+    def _start_trainer(self, cmd, required_envs):
+        #env.update(required_envs)
+
+        tr0_cmd = cmd
+        tr1_cmd = cmd
+
+        tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
+        tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
+        required_envs["PADDLE_TRAINER_ID"] = "0"
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=required_envs)
+        print("PADDLE_TRAINER_ID=0:")
+        print(required_envs)
+        required_envs["PADDLE_TRAINER_ID"] = "1"
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=required_envs)
+        print("PADDLE_TRAINER_ID=1:")
+        print(required_envs)
+        return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
+
+    def _run_cluster(self, model, envs):
+        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
+        python_path = self._python_interp
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
+            python_path += " -m coverage run --branch -p"
+        env.update(envs)
+
+        tr_cmd = "{0} {1}".format(python_path, model)
+
+        ps_cmd = "{0} {1}".format(python_path, model)
+
+        # Run dist train to compare with local results
+        env["TRAINING_ROLE"] = "PSERVER"
+        ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
+        print(ps_cmd)
+        env["TRAINING_ROLE"] = "TRAINER"
+        tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)
+
+        # Wait until trainer process terminate
+        while True:
+            stat0 = tr0.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+
+        while True:
+            stat1 = tr1.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
+
+        tr0_out, tr0_err = tr0.communicate()
+        tr1_out, tr1_err = tr1.communicate()
+
+        tr0_ret = tr0.returncode
+        tr1_ret = tr0.returncode
+
+        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
+        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
+
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+        ps0_pipe.close()
+        ps1_pipe.close()
+
+        ps0.terminate()
+        ps1.terminate()
+
+        return 0, 0
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": "",
+            "CPU_NUM": "2",
+            #PSERVER
+            "PADDLE_PSERVERS_IP_PORT_LIST": "127.0.0.1:36011,127.0.0.1:36012",
+            #"PADDLE_PSERVER_PORT_ARRAY":"(36011 36012)",
+            "PADDLE_PSERVER_NUMS": "2",
+            "PADDLE_TRAINER_ID": "0",
+            #TRAINER
+            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36013,127.0.0.1:36014",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_PSERVER_ID": "0",
+            #GLOO FLAG
+            "PADDLE_WITH_GLOO": "1",
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        print("path is not delete", os.path.exists("./tmp4"))
+        self.check_with_place(
+            "dist_fleet_debug_gloo.py", delta=1e-5, check_error_log=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index 833b7307fa317b171e3acbd3a508a1c8a8da3d94..e7b10be2349cce755267297025ca8520b6d494ee 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -19,10 +19,10 @@ import unittest
 import tempfile
 import shutil
 
+import paddle
 import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 # For Net
 base_lr = 0.2
@@ -149,40 +149,41 @@ class TestPSPassWithBow(unittest.TestCase):
         return [avg_cost, acc, cos_q_pt]
 
     def test(self):
-        endpoints = ["127.0.0.1:36004"]
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
-
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         loss, acc, _ = self.net()
-        optimizer = fluid.optimizer.SGD(base_lr)
-        strategy = StrategyFactory.create_async_strategy()
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(loss)
 
-        fleet.startup_program_bak = fleet.startup_program
-        fleet.startup_program = None
+        model_dir = tempfile.mkdtemp()
 
         with self.assertRaises(ValueError):
-            fleet.init_server()
-
-        model_dir = tempfile.mkdtemp()
+            fleet.init_server(os.path.join(model_dir, "temp"), "xxxx")
 
         with self.assertRaises(ValueError):
             fleet.init_server(os.path.join(model_dir, "temp"))
 
-        fleet.startup_program = fleet.startup_program_bak
         fleet.init_server()
 
         from paddle.fluid.communicator import LargeScaleKV
         kv = LargeScaleKV()
-        kv.save("__emb__", os.path.join(model_dir, "__emb__", "__emb__"))
-
-        fleet.main_program = fluid.Program()
+        kv.save("__emb__.block0",
+                os.path.join(model_dir, "__emb__", "__emb__.block0"))
+        fluid.framework.switch_main_program(fluid.Program())
         fleet.init_server(model_dir)
         shutil.rmtree(model_dir)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
index 761d57408b9a8f9e52419331bfb0bca5b0135c30..1062123948481a4164a12a4bed818b964923006f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
@@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase):
                          ["listen_and_serv"])
         # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale
         self.assertEqual([op.type for op in pserver.blocks[1].ops], [
-            "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow",
-            "scale"
+            "sum", "cast", "fill_constant", "elementwise_div", "floor",
+            "fill_constant", "elementwise_pow", "scale"
         ])
 
         # block1~2: optimize pass
diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..533ad9604cf0d879371796fb197e61e931fb479f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_distribution.py
@@ -0,0 +1,938 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import paddle
+from paddle import fluid
+from paddle.fluid import layers
+from paddle.distribution import *
+import math
+
+
+class DistributionNumpy():
+    def sample(self):
+        raise NotImplementedError
+
+    def entropy(self):
+        raise NotImplementedError
+
+    def kl_divergence(self, other):
+        raise NotImplementedError
+
+    def log_prob(self, value):
+        raise NotImplementedError
+
+    def probs(self, value):
+        raise NotImplementedError
+
+
+class UniformNumpy(DistributionNumpy):
+    def __init__(self, low, high):
+        self.low = np.array(low).astype('float32')
+        self.high = np.array(high).astype('float32')
+
+    def sample(self, shape):
+        shape = tuple(shape) + (self.low + self.high).shape
+        return self.low + (np.random.uniform(size=shape) *
+                           (self.high - self.low))
+
+    def log_prob(self, value):
+        lb = np.less(self.low, value).astype('float32')
+        ub = np.less(value, self.high).astype('float32')
+        return np.log(lb * ub) - np.log(self.high - self.low)
+
+    def probs(self, value):
+        lb = np.less(self.low, value).astype('float32')
+        ub = np.less(value, self.high).astype('float32')
+        return (lb * ub) / (self.high - self.low)
+
+    def entropy(self):
+        return np.log(self.high - self.low)
+
+
+class NormalNumpy(DistributionNumpy):
+    def __init__(self, loc, scale):
+        self.loc = np.array(loc).astype('float32')
+        self.scale = np.array(scale).astype('float32')
+
+    def sample(self, shape):
+        shape = tuple(shape) + (self.loc + self.scale).shape
+        return self.loc + (np.random.randn(*shape) * self.scale)
+
+    def log_prob(self, value):
+        var = self.scale * self.scale
+        log_scale = np.log(self.scale)
+        return -((value - self.loc) * (value - self.loc)) / (
+            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+
+    def probs(self, value):
+        var = self.scale * self.scale
+        return np.exp(-1. * ((value - self.loc) * (value - self.loc)) /
+                      (2. * var)) / (math.sqrt(2 * math.pi) * self.scale)
+
+    def entropy(self):
+        return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype(
+            'float32')) + np.log(self.scale)
+
+    def kl_divergence(self, other):
+        var_ratio = (self.scale / other.scale)
+        var_ratio = var_ratio * var_ratio
+        t1 = ((self.loc - other.loc) / other.scale)
+        t1 = (t1 * t1)
+        return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio))
+
+
+class DistributionTest(unittest.TestCase):
+    def setUp(self, use_gpu=False):
+        self.use_gpu = use_gpu
+        if not use_gpu:
+            place = fluid.CPUPlace()
+            self.gpu_id = -1
+        else:
+            place = fluid.CUDAPlace(0)
+            self.gpu_id = 0
+        self.executor = fluid.Executor(place)
+
+    def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float,
+                                scale_float, other_loc_float, other_scale_float,
+                                scale_np, other_scale_np, loc_np, other_loc_np,
+                                loc, scale, other_loc, other_scale, values):
+        """Generate Normal object and get the output of its methods including
+        ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``.
+        Parameters ``loc`` and ``scale`` have different data types to test different situations.
+
+        Args:
+          batch_size(int): The first dimension of the shape of parameters(loc and scale).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The sample value used in ``sample`` method.
+          loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number.
+          scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number.
+          other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a
+            float number. It is the first parameter in another Normal object used in ``kl_divergence``
+            method.
+          other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a
+            float number. It is the second parameter in another Normal object used in ``kl_divergence``
+            method.
+          scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array
+            whose shape is [batch_size, dims].
+          other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_scale_np
+            is an numpy array. It is the second parameter in another Normal object used in ``kl_divergence``
+            method.
+          loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array
+            whose shape is [batch_size, dims].
+          other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_loc_np
+            is an numpy array. It is the first parameter in another Normal object used in ``kl_divergence``
+            method.
+          loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph``, it's a Tensor filled
+            with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, ``layers.data``
+             method is used to get a Placeholder whose shape is [dims].
+          scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph``, it's a Tensor filled
+            with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, ``layers.data``
+             method is used to get a Placeholder whose shape is [dims].
+          other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph``, it's a Tensor
+            filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``,
+             ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the first parameter
+              in another Normal object used in ``kl_divergence`` method.
+          other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph``, it's a Tensor
+            filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``,
+             ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the second parameter
+              in another Normal object used in ``kl_divergence`` method.
+          values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph``, it's a Tensor filled with
+             ``values_np`` data. In static mode, values is generated in ``build_normal_static``, ``layers.data``
+             method is used to get a Placeholder whose shape is [dims].
+
+        Returns:
+          List: The elements of the list are the output of sample, entropy, log_prob, probs, kl_divergence methods.
+          The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be considered.
+
+        """
+        normal_int = Normal(int(loc_float), int(scale_float))
+        normal_float = Normal(loc_float, scale_float)
+        other_normal_float = Normal(other_loc_float, other_scale_float)
+
+        normal_float_np_broadcast = Normal(loc_float, scale_np)
+        other_normal_float_np_broadcast = Normal(other_loc_float,
+                                                 other_scale_np)
+
+        normal_np = Normal(loc_np, scale_np)
+        other_normal_np = Normal(other_loc_np, other_scale_np)
+
+        normal_variable = Normal(loc, scale)
+        other_normal_variable = Normal(other_loc, other_scale)
+
+        sample_int = normal_int.sample([batch_size, dims])
+        sample_float = normal_float.sample([batch_size, dims])
+        sample_float_np_broadcast = normal_float_np_broadcast.sample(
+            [batch_size, dims])
+        sample_np = normal_np.sample([batch_size, dims])
+        sample_variable = normal_variable.sample([batch_size, dims])
+
+        sample_int_diff = normal_int.sample([sample_shape])
+        sample_float_diff = normal_float.sample([sample_shape])
+        sample_float_np_broadcast_diff = normal_float_np_broadcast.sample(
+            [sample_shape])
+        sample_np_diff = normal_np.sample([sample_shape])
+        sample_variable_diff = normal_variable.sample([sample_shape])
+
+        entropy_int = normal_int.entropy()
+        entropy_float = normal_float.entropy()
+        entropy_float_np_broadcast = normal_float_np_broadcast.entropy()
+        entropy_np = normal_np.entropy()
+        entropy_variable = normal_variable.entropy()
+
+        lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values)
+        lp_np = normal_np.log_prob(values)
+        lp_variable = normal_variable.log_prob(values)
+
+        p_float_np_broadcast = normal_float_np_broadcast.probs(values)
+        p_np = normal_np.probs(values)
+        p_variable = normal_variable.probs(values)
+
+        kl_float = normal_float.kl_divergence(other_normal_float)
+        kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence(
+            other_normal_float_np_broadcast)
+        kl_np = normal_np.kl_divergence(other_normal_np)
+        kl_variable = normal_variable.kl_divergence(other_normal_variable)
+
+        fetch_list = [
+            sample_int, sample_float, sample_float_np_broadcast, sample_np,
+            sample_variable, sample_int_diff, sample_float_diff,
+            sample_float_np_broadcast_diff, sample_np_diff,
+            sample_variable_diff, entropy_int, entropy_float,
+            entropy_float_np_broadcast, entropy_np, entropy_variable,
+            lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
+            p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np,
+            kl_variable
+        ]
+        return fetch_list
+
+    def build_normal_static(self, test_program, batch_size, dims, sample_shape,
+                            loc_float, scale_float, other_loc_float,
+                            other_scale_float, scale_np, other_scale_np, loc_np,
+                            other_loc_np, values_np):
+        """
+        In static mode, generate feed data of Normal network, and get output fetch_list using
+        ``build_normal_common_net``.
+
+        Args:
+          test_program: In static mode, the Program object.
+          other args can refer to function ``build_normal_common_net``.
+
+        Returns:
+          feed_vars: The feed data of Normal network in static mode.
+          fetch_list: The output is generated by function ``build_normal_common_net``.
+        """
+        with fluid.program_guard(test_program):
+            loc = layers.data(name='loc', shape=[dims], dtype='float32')
+            scale = layers.data(name='scale', shape=[dims], dtype='float32')
+
+            other_loc = layers.data(
+                name='other_loc', shape=[dims], dtype='float32')
+            other_scale = layers.data(
+                name='other_scale', shape=[dims], dtype='float32')
+
+            values = layers.data(name='values', shape=[dims], dtype='float32')
+
+            fetch_list = self.build_normal_common_net(
+                batch_size, dims, sample_shape, loc_float, scale_float,
+                other_loc_float, other_scale_float, scale_np, other_scale_np,
+                loc_np, other_loc_np, loc, scale, other_loc, other_scale,
+                values)
+
+        feed_vars = {
+            'loc': loc_np,
+            'scale': scale_np,
+            'other_loc': other_loc_np,
+            'other_scale': other_scale_np,
+            'values': values_np
+        }
+        return feed_vars, fetch_list
+
+    def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float,
+                             scale_float, other_loc_float, other_scale_float,
+                             scale_np, other_scale_np, loc_np, other_loc_np,
+                             values_np):
+        """
+        In dynamic mode, generate input data of Normal network, and get output fetch_list using
+        ``build_normal_common_net``.
+
+        Args:
+          refer to function ``build_normal_common_net``.
+
+        Returns:
+          fetch_list_numpy: The output is generated by function ``build_normal_common_net``. Transform
+          these tensor to numpy.ndarray.
+        """
+        loc = paddle.to_tensor(loc_np)
+        scale = paddle.to_tensor(scale_np)
+        other_loc = paddle.to_tensor(other_loc_np)
+        other_scale = paddle.to_tensor(other_scale_np)
+        values = paddle.to_tensor(values_np)
+
+        fetch_list = self.build_normal_common_net(
+            batch_size, dims, sample_shape, loc_float, scale_float,
+            other_loc_float, other_scale_float, scale_np, other_scale_np,
+            loc_np, other_loc_np, loc, scale, other_loc, other_scale, values)
+        fetch_list_numpy = [t.numpy() for t in fetch_list]
+        return fetch_list_numpy
+
+    def get_normal_random_input(self, batch_size, dims):
+        """
+        Generate input data ``loc`` and ``scale`` used in Normal network.
+
+        Args:
+          refer to function ``build_normal_common_net``.
+
+        Returns:
+          List: Different data type of ``loc`` and ``scale``, including float, numpy.ndarray.
+          By the way, ``other_loc`` and ``other_scale`` are used in ``kl_divergence`` method.
+          refer to ``args`` in function ``build_normal_common_net``.
+        """
+        loc_np = np.random.randn(batch_size, dims).astype('float32')
+        other_loc_np = np.random.randn(batch_size, dims).astype('float32')
+
+        loc_float = (np.random.ranf() - 0.5) * 4
+        scale_float = (np.random.ranf() - 0.5) * 4
+        while scale_float < 0:
+            scale_float = (np.random.ranf() - 0.5) * 4
+
+        other_loc_float = (np.random.ranf() - 0.5) * 4
+        other_scale_float = (np.random.ranf() - 0.5) * 4
+        while other_scale_float < 0:
+            other_scale_float = (np.random.ranf() - 0.5) * 4
+
+        scale_np = np.random.randn(batch_size, dims).astype('float32')
+        other_scale_np = np.random.randn(batch_size, dims).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        while not np.all(scale_np > 0):
+            scale_np = np.random.randn(batch_size, dims).astype('float32')
+        while not np.all(other_scale_np > 0):
+            other_scale_np = np.random.randn(batch_size, dims).astype('float32')
+        return [
+            loc_np, other_loc_np, loc_float, scale_float, other_loc_float,
+            other_scale_float, scale_np, other_scale_np, values_np
+        ]
+
+    def compare_normal_with_numpy(self,
+                                  data_list,
+                                  output_list,
+                                  batch_size=2,
+                                  dims=3,
+                                  sample_shape=7,
+                                  tolerance=1e-6):
+        """
+        Compare the outputs of Normal's methods in paddle and numpy. If the outputs are not consistent,
+        raise errors.
+
+        Args:
+          data_list: Input data generated by function ``get_normal_random_input``.
+          output_list: The outputs of Normal's methods in static or dynamic mode.
+          batch_size(int): The first dimension of the shape of parameters(loc and scale).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The sample value used in ``sample`` method.
+          tolerance(float): The tolerance of the error.
+        """
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        np_normal_int = NormalNumpy(int(loc_float), int(scale_float))
+        np_normal_float = NormalNumpy(loc_float, scale_float)
+        np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float)
+        np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np)
+        np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float,
+                                                         other_scale_np)
+        np_normal = NormalNumpy(loc_np, scale_np)
+        np_other_normal = NormalNumpy(other_loc_np, other_scale_np)
+
+        gt_sample_int = np_normal_int.sample([batch_size, dims])
+        gt_sample_float = np_normal_float.sample([batch_size, dims])
+        gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample(
+            [batch_size, dims])
+        gt_sample_np = np_normal.sample([batch_size, dims])
+
+        gt_sample_int_diff = np_normal_int.sample([sample_shape])
+        gt_sample_float_diff = np_normal_float.sample([sample_shape])
+        gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample(
+            [sample_shape])
+        gt_sample_np_diff = np_normal.sample([sample_shape])
+
+        gt_entropy_int = np_normal_int.entropy()
+        gt_entropy_float = np_normal_float.entropy()
+        gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy()
+        gt_entropy = np_normal.entropy()
+        gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob(
+            values_np)
+        gt_lp = np_normal.log_prob(values_np)
+        gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np)
+        gt_p = np_normal.probs(values_np)
+        gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float)
+        gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence(
+            np_other_normal_float_np_broadcast)
+        gt_kl = np_normal.kl_divergence(np_other_normal)
+
+        [
+            output_sample_int, output_sample_float,
+            output_sample_float_np_broadcast, output_sample_np,
+            output_sample_variable, output_sample_int_diff,
+            output_sample_float_diff, output_sample_float_np_broadcast_diff,
+            output_sample_np_diff, output_sample_variable_diff,
+            output_entropy_int, output_entropy_float,
+            output_entropy_float_np_broadcast, output_entropy_np,
+            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
+            output_lp_variable, output_p_float_np_broadcast, output_p_np,
+            output_p_variable, output_kl_float, output_kl_float_np_broadcast,
+            output_kl_np, output_kl_variable
+        ] = output_list
+
+        np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape)
+        np.testing.assert_equal(output_sample_float.shape,
+                                gt_sample_float.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast.shape,
+                                gt_sample_float_np_broadcast.shape)
+        np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_variable.shape,
+                                gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_int_diff.shape,
+                                gt_sample_int_diff.shape)
+        np.testing.assert_equal(output_sample_float_diff.shape,
+                                gt_sample_float_diff.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape,
+                                gt_sample_float_np_broadcast_diff.shape)
+        np.testing.assert_equal(output_sample_np_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_equal(output_sample_variable_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_allclose(
+            output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float,
+            gt_entropy_float,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float_np_broadcast,
+            gt_entropy_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_float_np_broadcast,
+            gt_lp_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_float_np_broadcast,
+            gt_p_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_np, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_variable, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_float_np_broadcast,
+            gt_kl_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_np, gt_kl, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance)
+
+    def test_normal_distribution_static(self,
+                                        batch_size=2,
+                                        dims=3,
+                                        sample_shape=7,
+                                        tolerance=1e-6):
+        """
+        Test Normal's methods in static mode.
+
+        Args:
+          refer to ``compare_normal_with_numpy`` function.
+        """
+        test_program = fluid.Program()
+        data_list = self.get_normal_random_input(batch_size, dims)
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        feed_vars, fetch_list = self.build_normal_static(
+            test_program, batch_size, dims, sample_shape, loc_float,
+            scale_float, other_loc_float, other_scale_float, scale_np,
+            other_scale_np, loc_np, other_loc_np, values_np)
+        self.executor.run(fluid.default_startup_program())
+
+        output_list = self.executor.run(program=test_program,
+                                        feed=feed_vars,
+                                        fetch_list=fetch_list)
+
+        self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
+                                       sample_shape, tolerance)
+
+    def test_normal_distribution_dygraph(self,
+                                         batch_size=2,
+                                         dims=3,
+                                         sample_shape=7,
+                                         tolerance=1e-6):
+        """
+        Test Normal's methods in dynamic mode.
+
+        Args:
+          refer to ``compare_normal_with_numpy`` function.
+        """
+        paddle.disable_static()
+        data_list = self.get_normal_random_input(batch_size, dims)
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        output_list = self.build_normal_dygraph(
+            batch_size, dims, sample_shape, loc_float, scale_float,
+            other_loc_float, other_scale_float, scale_np, other_scale_np,
+            loc_np, other_loc_np, values_np)
+
+        self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
+                                       sample_shape, tolerance)
+        paddle.enable_static()
+
+    def build_uniform_common_net(self, batch_size, dims, sample_shape,
+                                 low_float, high_float, high_np, low_np,
+                                 values_np, low, high, values):
+        """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``,
+         ``log_prob`` and ``probs``.
+        Parameters ``low`` and ``high`` have different data types to test different situations.
+
+        Args:
+          batch_size(int): The first dimension of the shape of parameters(low and high).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The sample value used in ``sample`` method.
+          low_float(float): Parameter ``low`` is a float number.
+          high_float(float): Parameter ``high`` is a float number.
+          high_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims].
+          low_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims].
+          values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. An numpy array whose
+            shape is [batch_size, dims].
+          low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph``, it's a Tensor filled
+            with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``.
+          high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph``, it's a Tensor filled
+            with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``.
+          values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph``, it's a Tensor
+            filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``.
+
+        Returns:
+          List: The elements of the list are the output of sample, entropy, log_prob, probs methods.
+          The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be
+           considered.
+
+        """
+        uniform_int = Uniform(int(low_float), int(high_float))
+        uniform_float = Uniform(low_float, high_float)
+        uniform_float_np_broadcast = Uniform(low_float, high_np)
+        uniform_np = Uniform(low_np, high_np)
+        uniform_variable = Uniform(low, high)
+
+        sample_int = uniform_int.sample([batch_size, dims])
+        sample_float = uniform_float.sample([batch_size, dims])
+        sample_float_np_broadcast = uniform_float_np_broadcast.sample(
+            [batch_size, dims])
+        sample_np = uniform_np.sample([batch_size, dims])
+        sample_variable = uniform_variable.sample([batch_size, dims])
+
+        sample_int_diff = uniform_int.sample([sample_shape])
+        sample_float_diff = uniform_float.sample([sample_shape])
+        sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample(
+            [sample_shape])
+        sample_np_diff = uniform_np.sample([sample_shape])
+        sample_variable_diff = uniform_variable.sample([sample_shape])
+
+        entropy_int = uniform_int.entropy()
+        entropy_float = uniform_float.entropy()
+        entropy_float_np_broadcast = uniform_float_np_broadcast.entropy()
+        entropy_np = uniform_np.entropy()
+        entropy_variable = uniform_variable.entropy()
+
+        lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values)
+        lp_np = uniform_np.log_prob(values)
+        lp_variable = uniform_variable.log_prob(values)
+
+        p_float_np_broadcast = uniform_float_np_broadcast.probs(values)
+        p_np = uniform_np.probs(values)
+        p_variable = uniform_variable.probs(values)
+
+        fetch_list = [
+            sample_int, sample_float, sample_float_np_broadcast, sample_np,
+            sample_variable, sample_int_diff, sample_float_diff,
+            sample_float_np_broadcast_diff, sample_np_diff,
+            sample_variable_diff, entropy_int, entropy_float,
+            entropy_float_np_broadcast, entropy_np, entropy_variable,
+            lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
+            p_np, p_variable
+        ]
+        return fetch_list
+
+    def build_uniform_static(self, test_program, batch_size, dims, sample_shape,
+                             low_float, high_float, high_np, low_np, values_np):
+        """
+        In static mode, generate feed data of Uniform network, and get output fetch_list using
+        ``build_uniform_common_net``.
+
+        Args:
+          test_program: In static mode, the Program object.
+          other args can refer to function ``build_uniform_common_net``.
+
+        Returns:
+          feed_vars: The feed data of Uniform network in static mode.
+          fetch_list: The output is generated by function ``build_uniform_common_net``.
+        """
+        with fluid.program_guard(test_program):
+            low = layers.data(name='low', shape=[dims], dtype='float32')
+            high = layers.data(name='high', shape=[dims], dtype='float32')
+
+            values = layers.data(name='values', shape=[dims], dtype='float32')
+
+            fetch_list = self.build_uniform_common_net(
+                batch_size, dims, sample_shape, low_float, high_float, high_np,
+                low_np, values_np, low, high, values)
+
+        feed_vars = {'low': low_np, 'high': high_np, 'values': values_np}
+        return feed_vars, fetch_list
+
+    def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float,
+                              high_float, high_np, low_np, values_np):
+        """
+        In dynamic mode, generate input data of Uniform network, and get output fetch_list using
+        ``build_uniform_common_net``.
+
+        Args:
+          refer to function ``build_uniform_common_net``.
+
+        Returns:
+          fetch_list_numpy: The output is generated by function ``build_uniform_common_net``. Transform
+          these tensor to numpy.ndarray.
+        """
+        low = paddle.to_tensor(low_np)
+        high = paddle.to_tensor(high_np)
+        values = paddle.to_tensor(values_np)
+
+        fetch_list = self.build_uniform_common_net(
+            batch_size, dims, sample_shape, low_float, high_float, high_np,
+            low_np, values_np, low, high, values)
+        fetch_list_numpy = [t.numpy() for t in fetch_list]
+        return fetch_list_numpy
+
+    def compare_uniform_with_numpy(self,
+                                   data_list,
+                                   output_list,
+                                   batch_size=2,
+                                   dims=3,
+                                   sample_shape=7,
+                                   tolerance=1e-6):
+        """
+        Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent,
+        raise errors.
+
+        Args:
+          data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters.
+          output_list: The outputs of Uniform's methods in static or dynamic mode.
+          batch_size(int): The first dimension of the shape of parameters(low and high).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The sample value used in ``sample`` method.
+          tolerance(float): The tolerance of the error.
+        """
+        [low_np, low_float, high_float, high_np, values_np] = data_list
+
+        np_uniform_int = UniformNumpy(int(low_float), int(high_float))
+        np_uniform_float = UniformNumpy(low_float, high_float)
+        np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np)
+        np_uniform = UniformNumpy(low_np, high_np)
+
+        gt_sample_int = np_uniform_int.sample([batch_size, dims])
+        gt_sample_float = np_uniform_float.sample([batch_size, dims])
+        gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample(
+            [batch_size, dims])
+        gt_sample_np = np_uniform.sample([batch_size, dims])
+        gt_sample_int_diff = np_uniform_int.sample([sample_shape])
+        gt_sample_float_diff = np_uniform_float.sample([sample_shape])
+        gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample(
+            [sample_shape])
+        gt_sample_np_diff = np_uniform.sample([sample_shape])
+        gt_entropy_int = np_uniform_int.entropy()
+        gt_entropy_float = np_uniform_float.entropy()
+        gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy()
+        gt_entropy = np_uniform.entropy()
+        gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob(
+            values_np)
+        gt_lp = np_uniform.log_prob(values_np)
+        gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np)
+        gt_p = np_uniform.probs(values_np)
+
+        [
+            output_sample_int, output_sample_float,
+            output_sample_float_np_broadcast, output_sample_np,
+            output_sample_variable, output_sample_int_diff,
+            output_sample_float_diff, output_sample_float_np_broadcast_diff,
+            output_sample_np_diff, output_sample_variable_diff,
+            output_entropy_int, output_entropy_float,
+            output_entropy_float_np_broadcast, output_entropy_np,
+            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
+            output_lp_variable, output_p_float_np_broadcast, output_p_np,
+            output_p_variable
+        ] = output_list
+
+        np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape)
+        np.testing.assert_equal(output_sample_float.shape,
+                                gt_sample_float.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast.shape,
+                                gt_sample_float_np_broadcast.shape)
+        np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_variable.shape,
+                                gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_int_diff.shape,
+                                gt_sample_int_diff.shape)
+        np.testing.assert_equal(output_sample_float_diff.shape,
+                                gt_sample_float_diff.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape,
+                                gt_sample_float_np_broadcast_diff.shape)
+        np.testing.assert_equal(output_sample_np_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_equal(output_sample_variable_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_allclose(
+            output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float,
+            gt_entropy_float,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float_np_broadcast,
+            gt_entropy_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_float_np_broadcast,
+            gt_lp_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_float_np_broadcast,
+            gt_p_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_np, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_variable, gt_p, rtol=tolerance, atol=tolerance)
+
+    def test_uniform_distribution_static(self,
+                                         batch_size=2,
+                                         dims=3,
+                                         sample_shape=7,
+                                         tolerance=1e-6):
+        """
+        Test Uniform's methods in static mode.
+
+        Args:
+          refer to ``compare_uniform_with_numpy`` function.
+        """
+        test_program = fluid.Program()
+
+        low_np = np.random.randn(batch_size, dims).astype('float32')
+        low_float = np.random.uniform(-2, 1)
+        high_float = np.random.uniform(1, 3)
+        high_np = np.random.uniform(-5.0, 5.0,
+                                    (batch_size, dims)).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        data_list = [low_np, low_float, high_float, high_np, values_np]
+
+        feed_vars, fetch_list = self.build_uniform_static(
+            test_program, batch_size, dims, sample_shape, low_float, high_float,
+            high_np, low_np, values_np)
+
+        self.executor.run(fluid.default_startup_program())
+
+        # result calculated by paddle
+        output_list = self.executor.run(program=test_program,
+                                        feed=feed_vars,
+                                        fetch_list=fetch_list)
+        self.compare_uniform_with_numpy(data_list, output_list, batch_size,
+                                        dims, sample_shape, tolerance)
+
+    def test_uniform_distribution_dygraph(self,
+                                          batch_size=2,
+                                          dims=3,
+                                          sample_shape=7,
+                                          tolerance=1e-6):
+        """
+        Test Uniform's methods in dynamic mode.
+
+        Args:
+          refer to ``compare_uniform_with_numpy`` function.
+        """
+        paddle.disable_static()
+
+        low_np = np.random.randn(batch_size, dims).astype('float32')
+        low_float = np.random.uniform(-2, 1)
+        high_float = np.random.uniform(1, 3)
+        high_np = np.random.uniform(-5.0, 5.0,
+                                    (batch_size, dims)).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        data_list = [low_np, low_float, high_float, high_np, values_np]
+        output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape,
+                                                 low_float, high_float, high_np,
+                                                 low_np, values_np)
+
+        self.compare_uniform_with_numpy(data_list, output_list, batch_size,
+                                        dims, sample_shape, tolerance)
+        paddle.enable_static()
+
+
+class DistributionTestError(unittest.TestCase):
+    def test_distribution_error(self):
+        distribution = Distribution()
+
+        self.assertRaises(NotImplementedError, distribution.sample)
+        self.assertRaises(NotImplementedError, distribution.entropy)
+
+        normal = Normal(0.0, 1.0)
+        self.assertRaises(NotImplementedError, distribution.kl_divergence,
+                          normal)
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        self.assertRaises(NotImplementedError, distribution.log_prob,
+                          value_tensor)
+        self.assertRaises(NotImplementedError, distribution.probs, value_tensor)
+
+    def test_normal_error(self):
+        normal = Normal(0.0, 1.0)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, normal.log_prob, value)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, normal.probs, value)
+
+        shape = 1.0
+        # type of shape must be list
+        self.assertRaises(TypeError, normal.sample, shape)
+
+        seed = 1.0
+        # type of seed must be int
+        self.assertRaises(TypeError, normal.sample, [2, 3], seed)
+
+        normal_other = Uniform(1.0, 2.0)
+        # type of other must be an instance of Normal
+        self.assertRaises(TypeError, normal.kl_divergence, normal_other)
+
+    def test_uniform_error(self):
+        uniform = Uniform(0.0, 1.0)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, uniform.log_prob, value)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, uniform.probs, value)
+
+        shape = 1.0
+        # type of shape must be list
+        self.assertRaises(TypeError, uniform.sample, shape)
+
+        seed = 1.0
+        # type of seed must be int
+        self.assertRaises(TypeError, uniform.sample, [2, 3], seed)
+
+
+class DistributionTestName(unittest.TestCase):
+    def get_prefix(self, string):
+        return (string.split('.')[0])
+
+    def test_normal_name(self):
+        name = 'test_normal'
+        normal1 = Normal(0.0, 1.0, name=name)
+        self.assertEqual(normal1.name, name)
+
+        normal2 = Normal(0.0, 1.0)
+        self.assertEqual(normal2.name, 'Normal')
+
+        paddle.enable_static()
+
+        sample = normal1.sample([2])
+        self.assertEqual(self.get_prefix(sample.name), name + '_sample')
+
+        entropy = normal1.entropy()
+        self.assertEqual(self.get_prefix(entropy.name), name + '_entropy')
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        layers.assign(value_npdata, value_tensor)
+
+        lp = normal1.log_prob(value_tensor)
+        self.assertEqual(self.get_prefix(lp.name), name + '_log_prob')
+
+        p = normal1.probs(value_tensor)
+        self.assertEqual(self.get_prefix(p.name), name + '_probs')
+
+        kl = normal1.kl_divergence(normal2)
+        self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence')
+
+    def test_uniform_name(self):
+        name = 'test_uniform'
+        uniform1 = Uniform(0.0, 1.0, name=name)
+        self.assertEqual(uniform1.name, name)
+
+        uniform2 = Uniform(0.0, 1.0)
+        self.assertEqual(uniform2.name, 'Uniform')
+
+        paddle.enable_static()
+
+        sample = uniform1.sample([2])
+        self.assertEqual(self.get_prefix(sample.name), name + '_sample')
+
+        entropy = uniform1.entropy()
+        self.assertEqual(self.get_prefix(entropy.name), name + '_entropy')
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        layers.assign(value_npdata, value_tensor)
+
+        lp = uniform1.log_prob(value_tensor)
+        self.assertEqual(self.get_prefix(lp.name), name + '_log_prob')
+
+        p = uniform1.probs(value_tensor)
+        self.assertEqual(self.get_prefix(p.name), name + '_probs')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index cc3910d1b0c828f572f5e618b7aa9c55ecb93987..d18c8e25974441a6989b18a0fe13bac91251de9d 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
@@ -236,5 +237,501 @@ class TestDropoutOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestDropoutFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[40, 40], dtype="float32")
+            res1 = paddle.nn.functional.dropout(x=input, p=0., training=False)
+            res2 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=True, mode='upscale_in_train')
+            res3 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=True, mode='downscale_in_infer')
+            res4 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=False, mode='upscale_in_train')
+            res5 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=0,
+                training=False,
+                mode='downscale_in_infer')
+            res6 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=True,
+                mode='upscale_in_train')
+            res7 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=True,
+                mode='downscale_in_infer')
+            res8 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=False,
+                mode='upscale_in_train')
+            res9 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=False,
+                mode='downscale_in_infer')
+            res10 = paddle.nn.functional.dropout(x=input, p=1., training=True)
+
+            in_np = np.random.random([40, 40]).astype("float32")
+            res_np = in_np
+            res_np2 = np.zeros_like(in_np)
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+            fetches2 = exe.run(fluid.default_main_program(),
+                               feed={"input": in_np},
+                               fetch_list=[res10])
+            self.assertTrue(np.allclose(fetches2[0], res_np2))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([40, 40]).astype("float32")
+                res_np = in_np
+                res_np2 = np.zeros_like(in_np)
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout(
+                    x=input, p=0., training=False)
+                res2 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=True,
+                    mode='upscale_in_train')
+                res3 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=True,
+                    mode='downscale_in_infer')
+                res4 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=False,
+                    mode='upscale_in_train')
+                res5 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=False,
+                    mode='downscale_in_infer')
+                res6 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=True,
+                    mode='upscale_in_train')
+                res7 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=True,
+                    mode='downscale_in_infer')
+                res8 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=False,
+                    mode='upscale_in_train')
+                res9 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=False,
+                    mode='downscale_in_infer')
+                res10 = paddle.nn.functional.dropout(
+                    x=input, p=1., training=True)
+
+            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+            self.assertTrue(np.allclose(res10.numpy(), res_np2))
+
+
+class TestDropoutFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_Variable():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.dropout(x1, p=0.5)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_Variable2():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.dropout(x1, p=0.5, axis=0)
+
+            self.assertRaises(TypeError, test_Variable2)
+
+            def test_dtype():
+                # the input dtype of dropout must be float32 or float64
+                # float16 only can be set on GPU place
+                xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout(xr, p=0.5)
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_pdtype():
+                # p should be int or float
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, p='0.5')
+
+            self.assertRaises(TypeError, test_pdtype)
+
+            def test_pvalue():
+                # p should be 0.<=p<=1.
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, p=1.2)
+
+            self.assertRaises(ValueError, test_pvalue)
+
+            def test_mode():
+                # mode should be 'downscale_in_infer' or 'upscale_in_train'
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, mode='abc')
+
+            self.assertRaises(ValueError, test_mode)
+
+            def test_axis():
+                # axis should be int or list
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=1.2)
+
+            self.assertRaises(TypeError, test_axis)
+
+            def test_axis_max():
+                # maximum of axis should less than dimensions of x
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=[0, 5])
+
+            self.assertRaises(ValueError, test_axis_max)
+
+            def test_axis_len():
+                # length of axis should not greater than dimensions of x
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=[0, 1, 2, 3, 4])
+
+            self.assertRaises(ValueError, test_axis_len)
+
+
+class TestDropoutCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([40, 40]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestDropout2dFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 4, 5], dtype="float32")
+            res1 = paddle.nn.functional.dropout2d(
+                x=input, p=0., training=False, data_format='NCHW')
+            res2 = paddle.nn.functional.dropout2d(
+                x=input, p=0., training=False, data_format='NHWC')
+
+            in_np = np.random.random([2, 3, 4, 5]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([2, 3, 4, 5]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout2d(
+                    x=input, p=0., training=False, data_format='NCHW')
+                res2 = paddle.nn.functional.dropout2d(
+                    x=input, p=0., training=False, data_format='NHWC')
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestDropout2dFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_xdim():
+                # dimentions of x should be 4
+                x = fluid.data(name='x1', shape=[2, 3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout2d(x)
+
+            self.assertRaises(ValueError, test_xdim)
+
+            def test_dataformat():
+                # data_format should be 'NCHW' or 'NHWC'
+                x = fluid.data(name='x2', shape=[2, 3, 4, 5], dtype="int32")
+                paddle.nn.functional.dropout2d(x, data_format='CNHW')
+
+            self.assertRaises(ValueError, test_dataformat)
+
+
+class TestDropout2DCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 4, 5]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout2D(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestDropout3dFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 4, 5, 6], dtype="float32")
+            res1 = paddle.nn.functional.dropout3d(
+                x=input, p=0., training=False, data_format='NCDHW')
+            res2 = paddle.nn.functional.dropout3d(
+                x=input, p=0., training=False, data_format='NDHWC')
+
+            in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout3d(
+                    x=input, p=0., training=False, data_format='NCDHW')
+                res2 = paddle.nn.functional.dropout3d(
+                    x=input, p=0., training=False, data_format='NDHWC')
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestDropout3dFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_xdim():
+                # dimentions of x should be 5
+                x = fluid.data(name='x1', shape=[2, 3, 4, 5], dtype="int32")
+                paddle.nn.functional.dropout3d(x)
+
+            self.assertRaises(ValueError, test_xdim)
+
+            def test_dataformat():
+                # data_format should be 'NCDHW' or 'NDHWC'
+                x = fluid.data(name='x2', shape=[2, 3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout3d(x, data_format='CNDHW')
+
+            self.assertRaises(ValueError, test_dataformat)
+
+
+class TestDropout3DCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout3D(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestAlphaDropoutFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[40, 40], dtype="float32")
+            res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
+            res2 = paddle.nn.functional.alpha_dropout(
+                x=input, p=0., training=False)
+
+            in_np = np.random.random([40, 40]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([40, 40]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
+                res2 = paddle.nn.functional.alpha_dropout(
+                    x=input, p=0., training=False)
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestAlphaDropoutFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_Variable():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.alpha_dropout(x1, p=0.5)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_dtype():
+                # the input dtype of dropout must be float32 or float64
+                xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.alpha_dropout(xr)
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_pdtype():
+                # p should be int or float
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.alpha_dropout(x2, p='0.5')
+
+            self.assertRaises(TypeError, test_pdtype)
+
+            def test_pvalue():
+                # p should be 0.<=p<=1.
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.alpha_dropout(x2, p=1.2)
+
+            self.assertRaises(ValueError, test_pvalue)
+
+
+class TestAlphaDropoutCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([40, 40]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.AlphaDropout(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..466226c53fabbd315acd19c6421f210d0ca225c1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
@@ -0,0 +1,183 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy
+import collections
+from functools import reduce
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.nn.utils import weight_norm, remove_weight_norm
+
+
+class TestDygraphWeightNorm(unittest.TestCase):
+    def setUp(self):
+        self.init_test_case()
+        self.set_data()
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = None
+
+    def set_data(self):
+        self.data = collections.OrderedDict()
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_value = numpy.random.random(
+                size=[self.batch_size] + data_shape).astype('float32')
+            self.data[data_name] = data_value
+
+    def norm_except_dim(self, w, dim=None):
+        shape = w.shape
+        ndims = len(shape)
+        shape_numel = reduce(lambda x, y: x * y, shape)
+        if dim == -1:
+            return numpy.linalg.norm(w, axis=None, keepdims=True)
+        elif dim == 0:
+            tile_shape = list(w.shape)
+            tile_shape[0] = 1
+            w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0]))
+            return numpy.linalg.norm(w_matrix, axis=1, keepdims=True)
+        elif dim == (ndims - 1):
+            w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1]))
+            return numpy.linalg.norm(w_matrix, axis=0, keepdims=True)
+        else:
+            perm = list(range(ndims))
+            perm_ori = list(range(ndims))
+            perm[0] = dim
+            perm[dim] = 0
+            p_transposed = numpy.transpose(w, perm)
+            return self.norm_except_dim(p_transposed, 0)
+
+    def weight_normalize(self, w, dim=None):
+        shape = w.shape
+        ndims = len(shape)
+        shape_numel = reduce(lambda x, y: x * y, shape)
+        v = w
+        g = self.norm_except_dim(w, dim)
+        g_mul = g
+
+        if dim == -1:
+            v_norm = v / (numpy.linalg.norm(v, axis=None, keepdims=True))
+        elif dim == 0:
+            w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0]))
+            v_norm = v / numpy.linalg.norm(w_matrix, axis=1)
+            v_norm = numpy.reshape(v_norm, shape)
+            g = numpy.squeeze(g, axis=1)
+        elif dim == (ndims - 1):
+            w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1]))
+            v_norm = v / numpy.linalg.norm(w_matrix, axis=0, keepdims=True)
+            v_norm = numpy.reshape(v_norm, shape)
+        else:
+            perm = list(range(ndims))
+            perm[0] = dim
+            perm[dim] = 0
+            p_transposed = numpy.transpose(v, perm)
+            transposed_shape = p_transposed.shape
+            transposed_shape_numel = reduce(lambda x, y: x * y,
+                                            transposed_shape)
+            p_matrix = numpy.reshape(
+                p_transposed, (p_transposed.shape[0],
+                               transposed_shape_numel // p_transposed.shape[0]))
+            v_norm = v / numpy.expand_dims(
+                numpy.expand_dims(
+                    numpy.linalg.norm(
+                        p_matrix, axis=1, keepdims=True), axis=0),
+                axis=(ndims - 1))
+            v_norm = numpy.reshape(v_norm, transposed_shape)
+            v_norm = numpy.transpose(v_norm, perm)
+            g = numpy.squeeze(g, axis=1)
+            if dim == 1:
+                eaxis = 2
+            elif dim == 2:
+                eaxis = 1
+            g_mul = numpy.expand_dims(
+                numpy.expand_dims(
+                    numpy.expand_dims(
+                        g, axis=0), axis=eaxis),
+                axis=(ndims - 1))
+        w = g_mul * v_norm
+        return g, v
+
+    def test_check_output(self):
+        fluid.enable_imperative()
+        linear = paddle.nn.Conv2d(2, 3, 3)
+        before_weight = linear.weight.numpy()
+        if self.dim == None:
+            self.dim = -1
+        wn = weight_norm(linear, dim=self.dim)
+        outputs = []
+        for name, data in self.data.items():
+            output = linear(fluid.dygraph.to_variable(data))
+            outputs.append(output.numpy())
+        after_weight = linear.weight
+        self.actual_outputs = [linear.weight_g.numpy(), linear.weight_v.numpy()]
+
+        expect_output = self.weight_normalize(before_weight, self.dim)
+
+        for expect, actual in zip(expect_output, self.actual_outputs):
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual), expect, atol=0.001))
+
+
+class TestDygraphWeightNormCase1(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 0
+
+
+class TestDygraphWeightNormCase2(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 1
+
+
+class TestDygraphWeightNormCase3(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 3
+
+
+class TestDygraphRemoveWeightNorm(unittest.TestCase):
+    def setUp(self):
+        self.init_test_case()
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = None
+
+    def test_check_output(self):
+        fluid.enable_imperative()
+        linear = paddle.nn.Conv2d(2, 3, 3)
+        before_weight = linear.weight
+        wn = weight_norm(linear, dim=self.dim)
+        rwn = remove_weight_norm(linear)
+        after_weight = linear.weight
+        self.assertTrue(
+            numpy.allclose(
+                before_weight.numpy(), after_weight.numpy(), atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 6eeb355a6ba3a9c20156ebfd1389d50e92a5a0f5..c941d7c5f34352ac0e762403d0e7e3f0238cbe36 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -397,7 +397,7 @@ class TestAddOp(unittest.TestCase):
             y_1 = paddle.add(x, y, name='add_res')
             self.assertEqual(('add_res' in y_1.name), True)
 
-    def test_alpha(self):
+    def test_declarative(self):
         with fluid.program_guard(fluid.Program()):
 
             def gen_data():
@@ -408,33 +408,12 @@ class TestAddOp(unittest.TestCase):
 
             x = fluid.data(name="x", shape=[3], dtype='float32')
             y = fluid.data(name="y", shape=[3], dtype='float32')
-            z = paddle.add(x, y, alpha=10)
+            z = paddle.add(x, y)
 
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
-            z_expected = np.array([12., 53., 24.])
-            self.assertEqual((z_value == z_expected).all(), True)
-
-    def test_alpha_gpu(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        with fluid.program_guard(fluid.Program()):
-
-            def gen_data():
-                return {
-                    "x": np.array([2, 3, 4]).astype('float32'),
-                    "y": np.array([1, 5, 2]).astype('float32')
-                }
-
-            x = fluid.data(name="x", shape=[3], dtype='float32')
-            y = fluid.data(name="y", shape=[3], dtype='float32')
-            z = paddle.add(x, y, alpha=-0.5)
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
-            z_expected = np.array([1.5, 0.5, 3.])
+            z_expected = np.array([3., 8., 6.])
             self.assertEqual((z_value == z_expected).all(), True)
 
     def test_dygraph(self):
@@ -443,9 +422,9 @@ class TestAddOp(unittest.TestCase):
             np_y = np.array([1, 5, 2]).astype('float64')
             x = fluid.dygraph.to_variable(np_x)
             y = fluid.dygraph.to_variable(np_y)
-            z = paddle.add(x, y, alpha=-0.5)
+            z = paddle.add(x, y)
             np_z = z.numpy()
-            z_expected = np.array([1.5, 0.5, 3.])
+            z_expected = np.array([3., 8., 6.])
             self.assertEqual((np_z == z_expected).all(), True)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index de0fc591b664728387ccb988f3611fe034989627..9ebaf8ff9438be8c8a57815be0798b861d05caaf 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -240,25 +240,124 @@ class TestElementwiseDivBroadcast(unittest.TestCase):
             self.assertEqual((out_result == (2 / x)).all(), True)
 
 
-class TestDivOp(unittest.TestCase):
-    def test_name(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
-            y = fluid.data(name='y', shape=[2, 3], dtype='float32')
+class TestDivideAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+        # rule 2: both the inputs are not Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 2
+            y = 4
+            res = paddle.divide(x, y)
+            exe = fluid.Executor(place)
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={},
+                           fetch_list=[res])
+            self.assertEqual(np_z[0] == 0.5, True)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x / y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 1.5, 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = y / x
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 8, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 0.25, 0.5])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x / y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([2, 3, 4]).astype('float64'),
+                               "y": np.array([1, 5, 2]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([2., 0.6, 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
 
-            y_1 = paddle.div(x, y, name='div_res')
-            self.assertEqual(('div_res' in y_1.name), True)
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
 
     def test_dygraph(self):
-        with fluid.dygraph.guard():
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.div(x, y)
-            np_z = z.numpy()
-            z_expected = np.array([2., 0.6, 2.])
-            self.assertEqual((np_z == z_expected).all(), True)
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.divide, x=x, y=np_y)
+
+                # rule 2: both the inputs are not Tensor
+                z = paddle.divide(3, 2)
+                self.assertEqual(z.numpy()[0] == 1.5, True)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x / y
+                z_expected = np.array([1., 1.5, 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 1, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = y / x
+                z_expected = np.array([1., 2., 0.5])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x / y
+                z_expected = np.array([2., 0.6, 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index 104e896b6e440f5657a90e0ce741b49f72ba75c6..0b6acc7615395ed99a484e0e56f9c62447a1f345 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -15,6 +15,8 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest
 
@@ -56,6 +58,13 @@ class TestElementwiseModOp(OpTest):
         pass
 
 
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
@@ -65,5 +74,146 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
         self.out = np.floor_divide(self.x, self.y)
 
 
+class TestFloorDivideAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+        # rule 2: both the inputs are not Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 2
+            y = 4
+            res = paddle.floor_divide(x, y)
+            exe = fluid.Executor(place)
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={},
+                           fetch_list=[res])
+            self.assertEqual(np_z[0] == 0., True)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x // y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 1., 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = y // x
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 8, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 0., 0.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x // y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([2, 3, 4]).astype('float64'),
+                               "y": np.array([1, 5, 2]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([2., 0., 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y)
+
+                # rule 2: both the inputs are not Tensor
+                z = paddle.floor_divide(3, 2)
+                self.assertEqual(z.numpy()[0] == 1., True)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x // y
+                z_expected = np.array([1, 1, 2])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 1, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = y // x
+                z_expected = np.array([1, 2, 0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x // y
+                z_expected = np.array([2., 0., 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            # divide by zero 
+            np_x = np.array([2, 3, 4])
+            np_y = np.array([0])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            try:
+                z = x // y
+            except Exception as e:
+                print("Error: Divide by zero encounter in floor_divide\n")
+
+            # divide by zero 
+            np_x = np.array([2])
+            np_y = np.array([0, 0, 0])
+            x = paddle.to_tensor(np_x, dtype="int32")
+            y = paddle.to_tensor(np_y, dtype="int32")
+            try:
+                z = x // y
+            except Exception as e:
+                print("Error: Divide by zero encounter in floor_divide\n")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index 2c0fdf51769782e046b1b18ebd31782c81fd49f0..f5d8b4f704da8acd97475444346522f63d3724fd 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -15,6 +15,8 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest
 
@@ -82,5 +84,142 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
         self.dtype = np.float64
 
 
+class TestRemainderAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([0., 1., 0.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 3
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[1], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([1., 2., 4]).astype('float64'),
+                               "y": np.array([1.5]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([1., 0.5, 1.0])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[6], dtype="float64")
+            y = fluid.data(name="y", shape=[1], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(
+                fluid.default_main_program(),
+                feed={
+                    "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'),
+                    "y": np.array([2]).astype('float64')
+                },
+                fetch_list=[res])
+            z_expected = np.array([1., 0., 1., 1., 0., 1.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x % y
+                z_expected = np.array([0, 1, 0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.remainder, x=3, y=x)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([1., 2., 4])
+                np_y = np.array([1.5])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([1., 0.5, 1.0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([-3., -2, -1, 1, 2, 3])
+                np_y = np.array([2.])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([1., 0., 1., 1., 0., 1.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                np_x = np.array([-3.3, 11.5, -2, 3.5])
+                np_y = np.array([-1.2, 2., 3.3, -2.3])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
+                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+
+                np_x = np.array([-3, 11, -2, 3])
+                np_y = np.array([-1, 2, 3, -2])
+                x = paddle.to_tensor(np_x, dtype="int64")
+                y = paddle.to_tensor(np_y, dtype="int64")
+                z = x % y
+                z_expected = np.array([0, 1, 1, -1])
+                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
index e86f18a62167b7feab1549072fc296f847c00491..12b75c8bf703d2b31e6abb08bb233fb2874828ce 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
@@ -29,7 +29,7 @@ class TestElementwiseMulDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -56,7 +56,7 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -83,7 +83,7 @@ class TestElementwiseAddDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -110,7 +110,7 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -137,7 +137,7 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -164,7 +164,7 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -191,7 +191,7 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.0001
         dtype = np.float64
 
@@ -219,7 +219,7 @@ class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.0001
         dtype = np.float64
 
diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py
index 93ab0212f136adfedacb52a2fde47e15edf279d3..964e704c6a2ccbdc96fc281f6e417caf8351cdf7 100644
--- a/python/paddle/fluid/tests/unittests/test_erf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_erf_op.py
@@ -19,6 +19,7 @@ import numpy as np
 from scipy.special import erf
 from op_test import OpTest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 
@@ -58,6 +59,12 @@ class TestErfLayer(unittest.TestCase):
         if fluid.is_compiled_with_cuda():
             self._test_case(fluid.CUDAPlace(0))
 
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = paddle.static.data('x', [3, 4])
+            y = paddle.erf(x, name='erf')
+            self.assertTrue('erf' in y.name)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b1e3c5a28a5498d1a06654ea0a4ddcac6c7592b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
@@ -0,0 +1,84 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+class TestExecutor(unittest.TestCase):
+    def net(self):
+        lr = fluid.data(name="lr", shape=[1], dtype='float32')
+        x = fluid.data(name="x", shape=[None, 1], dtype='float32')
+        y = fluid.data(name="y", shape=[None, 1], dtype='float32')
+        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+
+        opt = fluid.optimizer.Adam(learning_rate=lr)
+        opt.minimize(avg_cost)
+
+        return lr, avg_cost
+
+    def test_program_check_feed(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                with self.assertRaises(ValueError):
+                    exe.run(feed={'x': train_data,
+                                  'lr': a},
+                            fetch_list=[lr, cost],
+                            return_numpy=False,
+                            use_prune=True)
+
+    def test_compiled_program_check_feed(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                compiled_prog = fluid.CompiledProgram(
+                    main_program).with_data_parallel(loss_name=cost.name)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                with self.assertRaises(ValueError):
+                    exe.run(compiled_prog,
+                            feed={'x': train_data,
+                                  'lr': a},
+                            fetch_list=[lr, cost],
+                            return_numpy=False,
+                            use_prune=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..4bc6bf3744f26cf7618d255f306bdb8f5fefb7a0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
@@ -0,0 +1,132 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+
+class TestExpandAsOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(100).astype("float64")
+        target_tensor = np.random.rand(2, 100).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [2, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(10, 12).astype("float64")
+        target_tensor = np.random.rand(10, 12).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(2, 3, 20).astype("float64")
+        target_tensor = np.random.rand(2, 3, 20).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [1, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(1, 1, 7, 16).astype("float64")
+        target_tensor = np.random.rand(4, 6, 7, 16).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [4, 6, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsV2Error(unittest.TestCase):
+    def test_errors(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype="uint8")
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
+            self.assertRaises(TypeError, paddle.tensor.expand_as, x1, x2)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tensor.expand_as, x3, x2)
+
+
+# Test python API
+class TestExpandAsV2API(unittest.TestCase):
+    def test_api(self):
+        input1 = np.random.random([12, 14]).astype("float32")
+        input2 = np.random.random([2, 12, 14]).astype("float32")
+        x = fluid.layers.data(
+            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+
+        y = fluid.layers.data(
+            name='target_tensor',
+            shape=[2, 12, 14],
+            append_batch_size=False,
+            dtype="float32")
+
+        out_1 = paddle.expand_as(x, y=y)
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res_1 = exe.run(fluid.default_main_program(),
+                        feed={"x": input1,
+                              "target_tensor": input2},
+                        fetch_list=[out_1])
+        assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..aee6ca249f535b9c06c00a6806ac491be16cd4b3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
@@ -0,0 +1,234 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+
+
+# Situation 1: shape is a list(without tensor)
+class TestExpandV2OpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+
+        self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")}
+        self.attrs = {'shape': self.shape}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.shape = [100]
+        self.expand_times = [1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.shape = [2, 120]
+        self.expand_times = [2, 1]
+
+
+class TestExpandV2OpRank2(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = [1, 140]
+        self.shape = [12, 140]
+        self.expand_times = [12, 1]
+
+
+class TestExpandV2OpRank3_Corner(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.shape = (2, 10, 5)
+        self.expand_times = (1, 1, 1)
+
+
+class TestExpandV2OpRank4(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.shape = (-1, -1, -1, -1)
+        self.expand_times = (1, 1, 1, 1)
+
+
+# Situation 2: shape is a list(with tensor)
+class TestExpandV2OpRank1_tensor_attr(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+        expand_shapes_tensor = []
+        for index, ele in enumerate(self.expand_shape):
+            expand_shapes_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'expand_shapes_tensor': expand_shapes_tensor,
+        }
+        self.attrs = {"shape": self.infer_expand_shape}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [1]
+        self.expand_shape = [100]
+        self.infer_expand_shape = [-1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.expand_times = [1, 1]
+        self.expand_shape = [12, 14]
+        self.infer_expand_shape = [12, -1]
+
+
+# Situation 3: shape is a tensor
+class TestExpandV2OpRank1_tensor(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'Shape': np.array(self.expand_shape).astype("int32"),
+        }
+        self.attrs = {}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [2, 1]
+        self.expand_shape = [2, 100]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+# Situation 4: input x is Integer
+class TestExpandV2OpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int32")
+        }
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 5: input x is Bool
+class TestExpandV2OpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 56: input x is Integer
+class TestExpandV2OpInt64_t(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int64")
+        }
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestExpandV2Error(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            shape = [2, 2]
+            self.assertRaises(TypeError, paddle.tensor.expand, x1, shape)
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
+            self.assertRaises(TypeError, paddle.tensor.expand, x2, shape)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tensor.expand, x3, shape)
+
+
+# Test python API
+class TestExpandV2API(unittest.TestCase):
+    def test_api(self):
+        input = np.random.random([12, 14]).astype("float32")
+        x = fluid.layers.data(
+            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+
+        positive_2 = fluid.layers.fill_constant([1], "int32", 12)
+        expand_shape = fluid.layers.data(
+            name="expand_shape",
+            shape=[2],
+            append_batch_size=False,
+            dtype="int32")
+
+        out_1 = paddle.expand(x, shape=[12, 14])
+        out_2 = paddle.expand(x, shape=[positive_2, 14])
+        out_3 = paddle.expand(x, shape=expand_shape)
+
+        g0 = fluid.backward.calc_gradient(out_2, x)
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
+                                      feed={
+                                          "x": input,
+                                          "expand_shape":
+                                          np.array([12, 14]).astype("int32")
+                                      },
+                                      fetch_list=[out_1, out_2, out_3])
+        assert np.array_equal(res_1, np.tile(input, (1, 1)))
+        assert np.array_equal(res_2, np.tile(input, (1, 1)))
+        assert np.array_equal(res_3, np.tile(input, (1, 1)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py
index 1a0a4ecb74d56910b3f92924085203f83b2c0145..9b541c323eceaa32591dbdc2ec149868ad7e8673 100644
--- a/python/paddle/fluid/tests/unittests/test_eye_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eye_op.py
@@ -74,73 +74,70 @@ class TestEyeOp2(OpTest):
 
 class API_TestTensorEye(unittest.TestCase):
     def test_out(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10)
             place = fluid.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, dtype="float32")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10, num_columns=7, dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, 7, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10, dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            out = paddle.eye(10, dtype="int64")
-            expected_result = np.eye(10, dtype="int64")
+        paddle.disable_static()
+        out = paddle.eye(10, dtype="int64")
+        expected_result = np.eye(10, dtype="int64")
+        paddle.enable_static()
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            batch_shape = [2]
-            out = fluid.layers.eye(10,
-                                   10,
-                                   dtype="int64",
-                                   batch_shape=batch_shape)
-            result = np.eye(10, dtype="int64")
-            expected_result = []
-            for index in reversed(batch_shape):
-                tmp_result = []
-                for i in range(index):
-                    tmp_result.append(result)
-                result = tmp_result
-                expected_result = np.stack(result, axis=0)
+        paddle.disable_static()
+        batch_shape = [2]
+        out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape)
+        result = np.eye(10, dtype="int64")
+        expected_result = []
+        for index in reversed(batch_shape):
+            tmp_result = []
+            for i in range(index):
+                tmp_result.append(result)
+            result = tmp_result
+            expected_result = np.stack(result, axis=0)
+        paddle.enable_static()
         self.assertEqual(out.numpy().shape == np.array(expected_result).shape,
                          True)
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            batch_shape = [3, 2]
-            out = fluid.layers.eye(10,
-                                   10,
-                                   dtype="int64",
-                                   batch_shape=batch_shape)
-            result = np.eye(10, dtype="int64")
-            expected_result = []
-            for index in reversed(batch_shape):
-                tmp_result = []
-                for i in range(index):
-                    tmp_result.append(result)
-                result = tmp_result
-                expected_result = np.stack(result, axis=0)
+        paddle.disable_static()
+        batch_shape = [3, 2]
+        out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape)
+        result = np.eye(10, dtype="int64")
+        expected_result = []
+        for index in reversed(batch_shape):
+            tmp_result = []
+            for i in range(index):
+                tmp_result.append(result)
+            result = tmp_result
+            expected_result = np.stack(result, axis=0)
+        paddle.enable_static()
         self.assertEqual(out.numpy().shape == np.array(expected_result).shape,
                          True)
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
     def test_errors(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
 
             def test_num_rows_type_check():
                 paddle.eye(-1, dtype="int64")
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 0812b02b47db7fa2d43e1d3bbd0a3f7b59911326..b30e0a6775ea9901d8c2a3a56b2e80141fffd23c 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -31,45 +31,45 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
-def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
+def channel_wise_quantize_max_abs(x, quant_bit=8, quant_axis=0):
+    assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
     scales = []
-    if not use_second_dim:
+    y = x.copy()
+    max_range = math.pow(2, quant_bit - 1) - 1
+    if quant_axis == 0:
         for i in range(x.shape[0]):
-            scales.append(np.max(np.abs(x[i])).astype("float32"))
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i, scale in enumerate(scales):
-            y[i] = np.round(x[i] / scale * max_range)
-    else:
-        for i in range(x.shape[0]):
-            s = []
-            for j in range(x.shape[1]):
-                s.append(np.max(np.abs(x[i][j])).astype("float32"))
-            scales.append(s)
-        scales = np.amax(np.array(scales), axis=0)
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i in range(x.shape[0]):
-            for j, scale in enumerate(scales):
-                y[i][j] = np.round(x[i][j] / scale * max_range)
+            scale = np.max(np.abs(x[i])).astype("float32")
+            scales.append(scale)
+            y[i] = np.round(x[i] * max_range / scale)
+    elif quant_axis == 1:
+        for i in range(x.shape[1]):
+            scale = np.max(np.abs(x[:, i])).astype("float32")
+            scales.append(scale)
+            y[:, i] = np.round(x[:, i] * max_range / scale)
     return y, scales
 
 
 def channel_wise_dequantize_max_abs(x,
                                     scales,
                                     quant_bits,
+                                    quant_axis,
                                     activation_scale=None):
-    if activation_scale is None:
-        y = x.copy()
-        for i in range(x.shape[0]):
-            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
+
+    if isinstance(quant_bits, list):
+        max_range = math.pow(2, quant_bits[0] - 1) - 1
     else:
-        y = x.copy()
+        max_range = math.pow(2, quant_bits - 1) - 1
+    y = x.copy()
+    if quant_axis == 0:
         for i in range(x.shape[0]):
-            for j in range(x.shape[1]):
-                y[i][j] = (scales[j] /
-                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
-        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
+            y[i] = x[i] * scales[i] / max_range
+    elif quant_axis == 1:
+        for i in range(x.shape[1]):
+            y[:, i] = x[:, i] * scales[i] / max_range
+
+    if activation_scale is not None:
+        y = y * activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
     return y
 
 
@@ -83,9 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(
-            x, self.quant_bits[0], use_second_dim=True)
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
+        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0], 1)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, 1,
                                               self.activation_scale)
 
         self.inputs = {
@@ -105,25 +104,39 @@ class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
     def set_args(self):
         self.quant_bits = [8]
         self.data_type = "float32"
+        self.quant_axis = 0
 
     def setUp(self):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
+        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0],
+                                                   self.quant_axis)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
+                                              self.quant_axis)
 
         self.inputs = {
             'X': yq,
             'Scales': [("scales0", np.array(scales).astype(self.data_type))]
         }
-        self.attrs = {'quant_bits': self.quant_bits}
+        self.attrs = {
+            'quant_bits': self.quant_bits,
+            'quant_axis': self.quant_axis
+        }
         self.outputs = {'Out': ydq}
 
     def test_check_output(self):
         self.check_output()
 
 
+class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1(
+        TestFakeChannelWiseDequantizeMaxAbsOpOneScale):
+    def set_args(self):
+        self.quant_bits = [8]
+        self.data_type = "float32"
+        self.quant_axis = 1
+
+
 class TestFakeDequantizeMaxAbsOp(OpTest):
     def set_args(self):
         self.num_bits = 8
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 1c8335e3bceab24cba9364a96f6907d2cf585fe0..7835fd3f53ddb7f9a95313c6cc5fc7b72ae6d664 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -72,28 +72,62 @@ class TestFakeQuantizeOp2(OpTest):
 
 class TestFakeChannelWiseQuantizeOp(OpTest):
     def setUp(self):
+        self.set_arg()
+        assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1."
+
         self.op_type = "fake_channel_wise_quantize_abs_max"
-        self.attrs = {'bit_length': 8}
-        self.inputs = {
-            'X': np.random.random((4, 3, 64, 64)).astype("float32"),
-        }
+        self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis}
+
         scales = []
-        for i in range(self.inputs['X'].shape[0]):
-            scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
         outputs = self.inputs['X'].copy()
-        for i, scale in enumerate(scales):
-            outputs[i] = np.round(outputs[i] / scale * (
-                (1 << (self.attrs['bit_length'] - 1)) - 1))
+        bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
+        if self.quant_axis == 0:
+            for i in range(self.inputs['X'].shape[0]):
+                scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32")
+                scales.append(scale_v)
+                outputs[i] = np.round(outputs[i] / scale_v * bnt)
+        elif self.quant_axis == 1:
+            for i in range(self.inputs['X'].shape[1]):
+                scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype(
+                    "float32")
+                scales.append(scale_v)
+                outputs[:, i] = np.round(outputs[:, i] / scale_v * bnt)
 
         self.outputs = {
             'Out': outputs,
             'OutScale': np.array(scales).astype("float32"),
         }
 
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {
+            'X': np.random.random((20, 15, 6, 6)).astype("float32"),
+        }
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 1
+        self.inputs = {
+            'X': np.random.random((15, 20, 5, 5)).astype("float32"),
+        }
+
+
+class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 0
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
+class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 1
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
 class TestFakeQuantizeRangeAbsMaxOp(OpTest):
     def setUp(self):
         self.op_type = "fake_quantize_range_abs_max"
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index aefc809bd5cb852d3fde95dff4550e506c5f1c12..3475320eeebc55a14dd569410610b70ae35e65a3 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -305,14 +305,18 @@ class TestFillConstantImperative(unittest.TestCase):
         with fluid.dygraph.guard():
             data1 = np.array([1, 2]).astype('int32')
             data2 = np.array([1.1]).astype('float32')
+            data3 = np.array([88]).astype('int32')
             shape = fluid.dygraph.to_variable(data1)
             val = fluid.dygraph.to_variable(data2)
+            value = fluid.dygraph.to_variable(data3)
             res1 = fluid.layers.fill_constant(
                 shape=[1, 2], dtype='float32', value=1.1)
             res2 = fluid.layers.fill_constant(
                 shape=shape, dtype='float32', value=1.1)
             res3 = fluid.layers.fill_constant(
                 shape=shape, dtype='float32', value=val)
+            res4 = fluid.layers.fill_constant(
+                shape=shape, dtype='int32', value=value)
             assert np.array_equal(
                 res1.numpy(), np.full(
                     [1, 2], 1.1, dtype="float32"))
@@ -322,6 +326,9 @@ class TestFillConstantImperative(unittest.TestCase):
             assert np.array_equal(
                 res3.numpy(), np.full(
                     [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res4.numpy(), np.full(
+                    [1, 2], 88, dtype="int32"))
 
 
 class TestFillConstantOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
index 6d67afe6cbfbb036ef54738a72d86ed798625112..642044bb4b1152b0c6d2b5a8a64e22410f9bd151 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
@@ -145,19 +145,22 @@ class TestFlatten2OpError(unittest.TestCase):
         x = x.astype('float32')
 
         def test_ValueError1():
-            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
             out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError1)
 
         def test_ValueError2():
-            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
             paddle.flatten(x_var, start_axis=10, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError2)
 
         def test_ValueError3():
-            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
             paddle.flatten(x_var, start_axis=2, stop_axis=10)
 
         self.assertRaises(ValueError, test_ValueError3)
@@ -191,8 +194,8 @@ class TestFlattenPython(unittest.TestCase):
         self.assertRaises(ValueError, test_InputError)
 
         def test_Negative():
-            paddle.enable_imperative()
-            img = paddle.imperative.to_variable(x)
+            paddle.disable_static()
+            img = paddle.to_variable(x)
             out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
             return out.numpy().shape
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet.py b/python/paddle/fluid/tests/unittests/test_fleet.py
index 449f31faf4035971f996e76612f10c882ce9179c..a705d5ee661fd5d0d28b791d6db4624b78281743 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet.py
@@ -34,7 +34,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -48,10 +49,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index ae4b5d7ecd7c5131e38904a0d8fde0b9bb4fbb89..38c3903306e6e76188cdb50476d6797814c434e9 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import unittest
 import paddle
 import os
@@ -23,8 +25,6 @@ class TestFleetAMPOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
     def test_amp_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         input_x = paddle.fluid.layers.data(
@@ -38,7 +38,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.amp = True
         strategy.amp_configs = {
             "init_loss_scaling": 32768,
@@ -51,7 +51,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
             "custom_black_list": ['tanh'],
         }
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 20542da3f05ec84b51dee8a9c5913bb20630f4a2..9e651dea24ba7f35f3785093da8ac73dde07be5a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -14,7 +14,10 @@
 
 import unittest
 import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import os
+import paddle.fluid as fluid
 
 
 class TestFleetBase(unittest.TestCase):
@@ -26,67 +29,49 @@ class TestFleetBase(unittest.TestCase):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_init(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
 
     def test_is_first_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_first_worker():
             print("test fleet first worker done.")
 
     def test_worker_index(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_index())
 
     def test_worker_num(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_num())
 
     def test_is_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             print("test fleet is worker")
 
     def test_worker_endpoints(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_endpoints(to_string=True))
 
     def test_server_num(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("fleet server num: {}".format(fleet.server_num()))
 
     def test_server_index(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("fleet server index: {}".format(fleet.server_index()))
 
     def test_server_endpoints(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
@@ -94,83 +79,50 @@ class TestFleetBase(unittest.TestCase):
                 fleet.server_endpoints(to_string=True)))
 
     def test_is_server(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("test fleet is server")
 
     def test_util(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         self.assertEqual(fleet.util, None)
 
     def test_barrier_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.barrier_worker()
 
     def test_init_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.init_worker()
 
     def test_run_server(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.run_worker()
 
     def test_stop_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.stop_worker()
 
     def test_distributed_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        strategy = fleet.DistributedStrategy()
-        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-
-    def test_minimize(self):
-        import paddle
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        strategy = fleet.DistributedStrategy()
         optimizer = paddle.optimizer.SGD(learning_rate=0.001)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        optimizer = fleet.distributed_optimizer(optimizer)
+
+    def test_exception(self):
+        import paddle.distributed.fleet as fleet
+        self.assertRaises(Exception, fleet.init_worker)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d666ea6740be149723e3bdbc00857a8931ce318e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_ps_minimize(self):
+        import paddle
+        import paddle.distributed.fleet as fleet
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        role = fleet.PaddleCloudRoleMaker(is_collective=False)
+        fleet.init(role)
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
+        compiled_prog = fluid.compiler.CompiledProgram(
+            fluid.default_main_program())
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor=pe)
+
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor="exe")
+
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor=exe,
+            main_program=compiled_prog)
+
+        self.assertRaises(
+            Exception, fleet.save_persistables, executor=pe, dirname='/tmp/')
+
+        self.assertRaises(
+            Exception, fleet.save_persistables, executor="exe", dirname='/tmp/')
+
+        self.assertRaises(
+            Exception,
+            fleet.save_persistables,
+            executor=exe,
+            dirname='/tmp/',
+            main_program=compiled_prog)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e888ab0eb3ca597bf62245ff9f3024fe81ee95
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_collective_minimize(self):
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        strategy = fleet.DistributedStrategy()
+        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b3fbb86a4af55d6838df3a628bf2cf194c5235d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_fleet_init(self):
+        import paddle.distributed.fleet as fleet
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        role = fleet.PaddleCloudRoleMaker(is_collective=False)
+        fleet.init(role)
+        fleet.init()
+        fleet.init(is_collective=False)
+        self.assertRaises(Exception, fleet.init, is_collective="F")
+        self.assertRaises(Exception, fleet.init, role_maker="F")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
old mode 100644
new mode 100755
index 0590650bd02f5535b9c35bae187e77bc7274901c..55d4ff7726aace09e486156d26efdecf22b310a5
--- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
@@ -14,9 +14,10 @@
 
 import unittest
 import paddle
+from paddle import fluid
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetDGCOptimizer(unittest.TestCase):
@@ -25,32 +26,42 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ[
             "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.dgc = True
-        strategy.dgc_configs = {
-            "rampup_begin_step": 128,
-            "rampup_step": 100,
-            "sparsity": [0.996, 0.999]
-        }
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.dgc = True
+                strategy.dgc_configs = {
+                    "rampup_begin_step": 128,
+                    "rampup_step": 100,
+                    "sparsity": [0.996, 0.999]
+                }
         return avg_cost, strategy
 
     def test_dgc_optimizer(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -59,8 +70,10 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         self.assertIn('dgc_momentum', ops)
 
     def test_dgc_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -72,8 +85,11 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 4994e4514d784f16006d25b4d714bfffc80af2de..8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -19,7 +19,7 @@ import os
 
 class TestStrategyConfig(unittest.TestCase):
     def test_amp(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.amp = True
         self.assertEqual(strategy.amp, True)
         strategy.amp = False
@@ -28,7 +28,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.amp, False)
 
     def test_amp_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {
             "init_loss_scaling": 32768,
             "decr_every_n_nan_or_inf": 2,
@@ -41,7 +41,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.amp_configs["init_loss_scaling"], 32768)
 
     def test_recompute(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.recompute = True
         self.assertEqual(strategy.recompute, True)
         strategy.recompute = False
@@ -50,13 +50,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.recompute, False)
 
     def test_recompute_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"checkpoints": ["x", "y"]}
         strategy.recompute_configs = configs
         self.assertEqual(len(strategy.recompute_configs["checkpoints"]), 2)
 
     def test_pipeline(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.pipeline = True
         self.assertEqual(strategy.pipeline, True)
         strategy.pipeline = False
@@ -65,13 +65,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.pipeline, False)
 
     def test_pipeline_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"micro_batch": 4}
         strategy.pipeline_configs = configs
         self.assertEqual(strategy.pipeline_configs["micro_batch"], 4)
 
     def test_localsgd(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.localsgd = True
         self.assertEqual(strategy.localsgd, True)
         strategy.localsgd = False
@@ -80,13 +80,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.localsgd, False)
 
     def test_localsgd_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 4}
         strategy.localsgd_configs = configs
         self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
 
     def test_dgc(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.dgc = True
         self.assertEqual(strategy.dgc, True)
         strategy.dgc = False
@@ -95,7 +95,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.dgc, False)
 
     def test_sync_nccl_allreduce(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.sync_nccl_allreduce = True
         self.assertEqual(strategy.sync_nccl_allreduce, True)
         strategy.sync_nccl_allreduce = False
@@ -104,14 +104,14 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.sync_nccl_allreduce, False)
 
     def test_nccl_comm_num(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.nccl_comm_num = 1
         self.assertEqual(strategy.nccl_comm_num, 1)
         strategy.nccl_comm_num = "2"
         self.assertEqual(strategy.nccl_comm_num, 1)
 
     def test_use_hierarchical_allreduce(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.use_hierarchical_allreduce = True
         self.assertEqual(strategy.use_hierarchical_allreduce, True)
         strategy.use_hierarchical_allreduce = False
@@ -120,14 +120,14 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.use_hierarchical_allreduce, False)
 
     def test_hierarchical_allreduce_inter_nranks(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.hierarchical_allreduce_inter_nranks = 8
         self.assertEqual(strategy.hierarchical_allreduce_inter_nranks, 8)
         strategy.hierarchical_allreduce_inter_nranks = "4"
         self.assertEqual(strategy.hierarchical_allreduce_inter_nranks, 8)
 
     def test_sync_batch_norm(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.sync_batch_norm = True
         self.assertEqual(strategy.sync_batch_norm, True)
         strategy.sync_batch_norm = False
@@ -136,7 +136,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.sync_batch_norm, False)
 
     def test_fuse_all_reduce_ops(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.fuse_all_reduce_ops = True
         self.assertEqual(strategy.fuse_all_reduce_ops, True)
         strategy.fuse_all_reduce_ops = False
@@ -145,21 +145,21 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.fuse_all_reduce_ops, False)
 
     def test_fuse_grad_size_in_MB(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.fuse_grad_size_in_MB = 50
         self.assertEqual(strategy.fuse_grad_size_in_MB, 50)
         strategy.fuse_grad_size_in_MB = "40"
         self.assertEqual(strategy.fuse_grad_size_in_MB, 50)
 
     def test_fuse_grad_size_in_TFLOPS(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy._fuse_grad_size_in_TFLOPS = 0.1
         self.assertGreater(strategy._fuse_grad_size_in_TFLOPS, 0.09)
         strategy._fuse_grad_size_in_TFLOPS = "0.3"
         self.assertGreater(strategy._fuse_grad_size_in_TFLOPS, 0.09)
 
     def test_gradient_merge(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.gradient_merge = True
         self.assertEqual(strategy.gradient_merge, True)
         strategy.gradient_merge = False
@@ -168,13 +168,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.gradient_merge, False)
 
     def test_gradient_merge_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 4}
         strategy.gradient_merge_configs = configs
         self.assertEqual(strategy.gradient_merge_configs["k_steps"], 4)
 
     def test_lars(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.lars = True
         self.assertEqual(strategy.lars, True)
         strategy.lars = False
@@ -183,7 +183,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.lars, False)
 
     def test_lamb(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.lamb = True
         self.assertEqual(strategy.lamb, True)
         strategy.lamb = False
@@ -192,22 +192,23 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.lamb, False)
 
     def test_a_sync(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         self.assertEqual(strategy.a_sync, True)
         strategy.a_sync = False
         self.assertEqual(strategy.a_sync, False)
-        strategy.a_sync = "True"
-        self.assertEqual(strategy.a_sync, False)
+
+        with self.assertRaises(ValueError):
+            strategy.a_sync = "True"
 
     def test_a_sync_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 1000}
         strategy.a_sync_configs = configs
         self.assertEqual(strategy.a_sync_configs["k_steps"], 1000)
 
     def test_elastic(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.elastic = True
         self.assertEqual(strategy.elastic, True)
         strategy.elastic = False
@@ -216,7 +217,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.elastic, False)
 
     def test_auto(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.auto = True
         self.assertEqual(strategy.auto, True)
         strategy.auto = False
@@ -225,7 +226,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.auto, False)
 
     def test_strategy_prototxt(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         strategy.localsgd = True
         strategy.dgc = True
@@ -254,7 +255,7 @@ class TestStrategyConfig(unittest.TestCase):
         exe_strategy.num_iteration_per_run = 10
         strategy.execution_strategy = exe_strategy
         strategy.save_to_prototxt("dist_strategy.prototxt")
-        strategy2 = paddle.fleet.DistributedStrategy()
+        strategy2 = paddle.distributed.fleet.DistributedStrategy()
         strategy2.load_from_prototxt("dist_strategy.prototxt")
         self.assertEqual(strategy.dgc, strategy2.dgc)
 
@@ -276,7 +277,7 @@ class TestStrategyConfig(unittest.TestCase):
         build_strategy.enable_backward_optimizer_op_deps = True
         build_strategy.trainers_endpoints = ["1", "2"]
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.build_strategy = build_strategy
 
     def test_execution_strategy(self):
@@ -285,9 +286,36 @@ class TestStrategyConfig(unittest.TestCase):
         exe_strategy.num_iteration_per_drop_scope = 10
         exe_strategy.num_iteration_per_run = 10
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.execution_strategy = exe_strategy
 
+    def test_unknown_strategy(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        with self.assertRaises(TypeError):
+            strategy.unknown_key = 'UNK'
+
+    def test_cudnn_exhaustive_search(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.cudnn_exhaustive_search = False
+        self.assertEqual(strategy.cudnn_exhaustive_search, False)
+        strategy.cudnn_exhaustive_search = "True"
+        self.assertEqual(strategy.cudnn_exhaustive_search, False)
+
+    def test_cudnn_batchnorm_spatial_persistent(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.cudnn_batchnorm_spatial_persistent = False
+        self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
+        strategy.cudnn_batchnorm_spatial_persistent = "True"
+        self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
+
+    def test_conv_workspace_size_limit(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.conv_workspace_size_limit = 1000
+        self.assertEqual(strategy.conv_workspace_size_limit, 1000)
+        strategy.conv_workspace_size_limit = "400"
+        self.assertEqual(strategy.conv_workspace_size_limit, 1000)
+        strategy._enable_env()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index 36d5912cb7eff23dfde9ef3f12fbc2b782b2ccd3..af72df5186876a8bcbaa5bfa6d71a27fdf46b119 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -15,8 +15,8 @@
 import unittest
 import paddle
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
@@ -41,10 +41,10 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.gradient_merge = True
         strategy.gradient_merge_configs = {"k_steps": 2, "avg": True}
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
index 7998b1fa5d12e4ca3b7da0f71ed295957f86a279..9eec73116cc283b58d3ee39cefb9256e12d4ef15 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
@@ -15,34 +15,44 @@
 import unittest
 import paddle
 import os
-from launch_function_helper import launch_func
+from launch_function_helper import launch_func, wait, _find_free_port
 
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        try:
+            self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"])
+            self._dist_ut_port_1 = self._dist_ut_port_0 + 1
+        except Exception as e:
+            self._dist_ut_port_0 = _find_free_port(set())
+            self._dist_ut_port_1 = _find_free_port(set())
+
     def test_graph_execution_optimizer_not_apply(self):
+        port_a = self._dist_ut_port_0
+        port_b = self._dist_ut_port_1
         node_a = {
             "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         node_b = {
             "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-            fleet.init(role)
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
             input_x = paddle.fluid.layers.data(
                 name="x", shape=[32], dtype='float32')
             input_y = paddle.fluid.layers.data(
@@ -57,8 +67,8 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
             optimizer.minimize(avg_cost)
@@ -67,33 +77,35 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
     def test_graph_execution_optimizer(self):
+        port_a = self._dist_ut_port_0 + 2
+        port_b = self._dist_ut_port_1 + 2
+
         node_a = {
             "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         node_b = {
             "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-            fleet.init(role)
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
             input_x = paddle.fluid.layers.data(
                 name="x", shape=[32], dtype='float32')
             input_y = paddle.fluid.layers.data(
@@ -108,13 +120,134 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
+            exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
+            exe.run(paddle.fluid.default_startup_program())
+
+            import numpy as np
+
+            def gen_data():
+                return {
+                    "x": np.random.random(size=(128, 32)).astype('float32'),
+                    "y": np.random.randint(
+                        2, size=(128, 1)).astype('int64')
+                }
+
+            for i in range(10):
+                cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name])
+                print("cost of step[{}] = {}".format(i, cost_val))
+
+        proc_a = launch_func(node_func, node_a)
+        proc_a.start()
+        proc_b = launch_func(node_func, node_b)
+        proc_b.start()
+        wait([proc_a, proc_b])
+
+    def test_graph_execution_optimizer_not_apply_v2(self):
+        port_a = self._dist_ut_port_0 + 4
+        port_b = self._dist_ut_port_1 + 4
+        node_a = {
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        node_b = {
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        def node_func():
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
+            input_x = paddle.fluid.layers.data(
+                name="x", shape=[32], dtype='float32')
+            input_y = paddle.fluid.layers.data(
+                name="y", shape=[1], dtype='int64')
+
+            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+            prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                size=2,
+                                                act='softmax')
+            cost = paddle.fluid.layers.cross_entropy(
+                input=prediction, label=input_y)
+            avg_cost = paddle.fluid.layers.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
             optimizer.minimize(avg_cost)
+
+        proc_a = launch_func(node_func, node_a)
+        proc_a.start()
+        proc_b = launch_func(node_func, node_b)
+        proc_b.start()
+        wait([proc_a, proc_b])
+
+    def test_graph_execution_optimizer(self):
+        port_a = self._dist_ut_port_0 + 6
+        port_b = self._dist_ut_port_1 + 6
+        node_a = {
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        node_b = {
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        def node_func():
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
+            input_x = paddle.fluid.layers.data(
+                name="x", shape=[32], dtype='float32')
+            input_y = paddle.fluid.layers.data(
+                name="y", shape=[1], dtype='int64')
+
+            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+            prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                size=2,
+                                                act='softmax')
+            cost = paddle.fluid.layers.cross_entropy(
+                input=prediction, label=input_y)
+            avg_cost = paddle.fluid.layers.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.nccl_comm_num = 2
+            strategy.sync_nccl_allreduce = True
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
             exe.run(paddle.fluid.default_startup_program())
 
@@ -135,8 +268,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
index 47e8949922a01855c6d1f1947f0b8b5282da3c48..69f5b134888b0f3268cea112eeefd9fb7fd0127f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
@@ -14,6 +14,8 @@
 
 import unittest
 import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import os
 from launch_function_helper import launch_func
 
@@ -39,8 +41,6 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
             role = role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             input_x = paddle.fluid.layers.data(
@@ -57,10 +57,10 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
             optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
new file mode 100755
index 0000000000000000000000000000000000000000..3f140f53b043b1949572f3728ca8a0c556317783
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+from paddle import fluid
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetLambMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.lamb = True
+                strategy.lamb_configs = {
+                    'lamb_weight_decay': 0.01,
+                    'exclude_from_weight_decay': [],
+                }
+
+        return avg_cost, strategy
+
+    def test_lamb_optimizer(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lamb', ops)
+
+    def test_lamb_not_apply_with_momentum(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.1, momentum=0.9)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertNotIn('lamb', ops)
+
+    def test_lamb_exclude_fn(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        strategy.lamb_configs = {
+            'lamb_weight_decay': 0.01,
+            'exclude_from_weight_decay': ['.b_0'],
+        }
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops_with_bias = [
+            op for op in avg_cost.block.ops
+            if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
+        ]
+        for op in ops_with_bias:
+            self.assertEqual(op.attr('weight_decay'), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index 960ffbd4035f9c1891a205cd8afbd1ca581284bd..3caa1a4eac0bf191b13e6708b1a9adffdb111ca7 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -14,9 +14,10 @@
 
 import unittest
 import paddle
+from paddle import fluid
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetLarsMetaOptimizer(unittest.TestCase):
@@ -27,32 +28,42 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
                        "127.0.0.1:36001,127.0.0.2:36001"
 
-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.lars = True
-        strategy.lars_configs = {
-            "lars_coeff": 0.001,
-            "lars_weight_decay": 0.0005,
-        }
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.lars = True
+                strategy.lars_configs = {
+                    "lars_coeff": 0.001,
+                    "lars_weight_decay": 0.0005,
+                }
 
         return avg_cost, strategy
 
     def test_lars_optimizer(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -60,8 +71,12 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         self.assertIn('lars_momentum', ops)
 
     def test_lars_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index 5e5c4e17f5b97b12b17c8145c449327bbdad1967..c5edc96963408bf1fad793f7271d75159934f019 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -10,6 +10,22 @@ function test_launch_ps(){
         echo "test pserver launch failed"
         exit -1
     fi
+
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
+
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
 }
 
 if [[ ${WITH_GPU} == "OFF" ]]; then
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index 1f2ceb298e72edda3da7d4ddd00444208bb21591..07b988bf8752057e68925bc42f564a72d466361d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -16,8 +16,8 @@ import unittest
 import paddle
 import os
 
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
@@ -39,14 +39,14 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.localsgd = True
         strategy.auto = True
         config = strategy.localsgd_configs
         config['k_steps'] = 1
         strategy.localsgd_configs = config
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
new file mode 100755
index 0000000000000000000000000000000000000000..dfea848aadfc44c57c91c11d196eff49d57cab08
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+from paddle import fluid
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase
+
+
+class TestFleetMetaOptimizerBase(unittest.TestCase):
+    def net(main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+                opt = MetaOptimizerBase(optimizer)
+                opt_ops, params_grads = opt.minimize(avg_cost)
+                opt.apply_optimize(avg_cost,
+                                   paddle.static.default_startup_program(),
+                                   params_grads)
+        return None
+
+    net(fluid.default_startup_program(), fluid.default_main_program())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_metric.py b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
index 2dacc02797a251b894118e170f5a287a848790fc..6a7963f43824f61960e6523ceb5c618f783e8a7a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_metric.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
@@ -19,7 +19,7 @@ import paddle
 import paddle.fluid as fluid
 import os
 import unittest
-import paddle.fleet.metrics.metric as metric
+import paddle.distributed.fleet.metrics.metric as metric
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
index 7b7e3c7c4173fe34368d6f4207491b3800907f57..b2b6136797ba460f9f829d5df4c7041664b424cb 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
@@ -33,7 +33,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -47,10 +48,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
index 0005a4a8dbebff04cd9b11d0af082b01c718ca48..adbb1268c6f4d7b21876dacdbbb3cf453a14d0f4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
@@ -19,11 +19,13 @@ import os
 
 class TestFleetMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
     def test_pipeline_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         with paddle.fluid.device_guard("cpu"):
@@ -47,11 +49,11 @@ class TestFleetMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.pipeline = True
         strategy.pipeline_configs = {'micro_batch': 2}
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
index ec99acf109816570db48d9f15bbbdd897133006a..beec6d7f51c4f54ba5cbf2af255afd6082cedd52 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
@@ -36,7 +36,7 @@ class TestFleetPrivateFunction(unittest.TestCase):
         thr = threading.Thread(target=init_server, args=(9292, ))
         thr.start()
 
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         ep = ["127.0.0.1:9292"]
         fleet.base.private_helper_function.wait_server_ready(ep)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
index f62c8d32d6cfa3bbcc20e9b5f862387f05d475fb..a42010a4eaa5066821adb817e7a5df2b81bedf7c 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
@@ -26,8 +26,8 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_recompute_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         input_x = paddle.fluid.layers.data(
@@ -41,11 +41,11 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.recompute = True
-        strategy.recompute_configs = {"checkpoints": ["fc2"]}
+        strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
index 3abad755ac1755ddd62859fae45a14e6aaf528ee..7f1ad5d52d8f0b5a6b5bd83ea3a158f123e870ea 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
@@ -61,7 +61,8 @@ class TestCloudRoleMaker(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -75,10 +76,11 @@ class TestCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #print("init rolemaker")
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index 351dc0a5d0f66d91e063c0ef3ce84cd3756c0860..eb5d9eb66608dd397dad773158c337fc67be2dbb 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -163,7 +163,7 @@ class TestCloudRoleMaker2(unittest.TestCase):
             data = "1 1 1 1\n"
             f.write(data)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
         dataset.set_use_var([show, label])
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
index 39d3d2a2a042c74f2af0e92dd740a28ef60a5d5d..0fa852eeeebe9c8fbb056fca388a0af2c8f92842 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
@@ -33,7 +33,8 @@ class TestCloudRoleMaker(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -50,10 +51,10 @@ class TestCloudRoleMaker(unittest.TestCase):
             init_timeout_seconds=100,
             run_timeout_seconds=100,
             http_ip_port="127.0.0.1:36003")
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
index a91f6cbd69e18e949b14787f46923c6df11e9b04..6414ef18d635aea4b73c17a4931c37e596ed6029 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
             from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
             from paddle.fluid.incubate.fleet.base.role_maker import \
                 GeneralRoleMaker
-            from paddle.fleet.utils import KVHandler
-            from paddle.fleet.utils import KVServer
-            from paddle.fleet.utils import KVHTTPServer
+            from paddle.distributed.fleet.utils import KVHandler
+            from paddle.distributed.fleet.utils import KVServer
+            from paddle.distributed.fleet.utils import KVHTTPServer
         except:
             print("warning: no fleet, skip test_pslib_4")
             return
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index 659cc34b5495894e883f10fb73a56719c9c58442..cf9b3e1e9a1605a714b47d99183511b24c903722 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -16,7 +16,7 @@
 from __future__ import print_function
 import os
 import unittest
-import paddle.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestRoleMakerBase(unittest.TestCase):
@@ -34,6 +34,7 @@ class TestRoleMakerBase(unittest.TestCase):
         self.assertRaises(Exception, role.worker_index)
         self.assertRaises(Exception, role.server_index)
         self.assertRaises(Exception, role.role_id)
+        self.assertRaises(Exception, role.node_num)
 
         trainer_endpoints = role.get_trainer_endpoints()
         self.assertTrue(len(trainer_endpoints) == 0)
@@ -80,10 +81,12 @@ class TestCloudRoleMaker(unittest.TestCase):
         worker_endpoints = ro.get_trainer_endpoints()
         self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
         self.assertEqual(ro.role_id(), 0)
+        self.assertEqual(ro.node_num(), 2)
 
     def test_tr_rolemaker_collective(self):
         ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
         self.assertEqual(ro.worker_num(), 2)
+        self.assertEqual(ro.node_num(), 2)
 
     def test_ps_rolemaker(self):
         """Test ps rolemaker."""
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
index 474e5da1c219c4b6e5a35a59ee235fdcbdb34cce..80109716a54e52dc6050b724046561f37020a645 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
@@ -19,21 +19,45 @@ import os
 
 class TestFleetRuntime(unittest.TestCase):
     def test_fleet_runtime_base(self):
-        import paddle.fleet.runtime
-        base = paddle.fleet.runtime.runtime_base.RuntimeBase()
+        import paddle.distributed.fleet.runtime
+        base = paddle.distributed.fleet.runtime.runtime_base.RuntimeBase()
         base._run_worker()
         base._init_server()
         base._run_server()
         base._stop_worker()
+        base._save_inference_model()
+        base._save_persistables()
 
     def test_fleet_collective_runtime(self):
-        import paddle.fleet.runtime
-        collective_runtime = paddle.fleet.runtime.CollectiveRuntime()
+        import paddle.distributed.fleet.runtime
+        collective_runtime = paddle.distributed.fleet.runtime.CollectiveRuntime(
+        )
         collective_runtime._init_worker()
         collective_runtime._run_worker()
         collective_runtime._init_worker()
         collective_runtime._run_server()
         collective_runtime._stop_worker()
+        collective_runtime._save_inference_model()
+        collective_runtime._save_persistables()
+
+    def test_fleet_ps_runtime(self):
+        ps_runtime = paddle.distributed.fleet.runtime.ParameterServerRuntime()
+        self.assertRaises(Exception, ps_runtime._get_optimizer_status,
+                          "test_op", None)
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status("adam",
+                                                                        "param")
+        self.assertTrue(
+            len(reshaped_names) == 2 and
+            reshaped_names[0] == 'param_moment1_0' and
+            reshaped_names[1] == 'param_moment2_0')
+        self.assertTrue(
+            len(origin_names) == 2 and
+            origin_names[0] == 'param_beta1_pow_acc_0' and
+            origin_names[1] == 'param_beta2_pow_acc_0')
+
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status("sgd",
+                                                                        "param")
+        self.assertTrue(len(reshaped_names) == 0 and len(origin_names) == 0)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
index 3b0e8be63d95f29ccd1da145403a7a441698fead..7a255e5da14dacc1a5552642640e3ffe1e4eaad4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
@@ -33,7 +33,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -47,10 +48,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index e52cb5f920c2ebdf54c8b3e64cf61d16baaeadf4..dde36e073fb20eed3b17c79a886739f59ecb185d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -22,8 +22,8 @@ import tempfile
 import os
 import sys
 from paddle.dataset.common import download, DATA_HOME
-from paddle.fleet.base.util_factory import fleet_util
-import paddle.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.base.util_factory import fleet_util
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetUtil(unittest.TestCase):
@@ -34,7 +34,7 @@ class TestFleetUtil(unittest.TestCase):
     train_dir = os.path.join("fleet_util_data", "train_program")
 
     def test_util_base(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         util = fleet.UtilBase()
         strategy = fleet.DistributedStrategy()
         util._set_strategy(strategy)
@@ -42,7 +42,7 @@ class TestFleetUtil(unittest.TestCase):
         util._set_role_maker(role_maker)
 
     def test_util_factory(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         factory = fleet.base.util_factory.UtilFactory()
         strategy = fleet.DistributedStrategy()
         role_maker = None  # should be fleet.PaddleCloudRoleMaker()
@@ -55,15 +55,15 @@ class TestFleetUtil(unittest.TestCase):
         self.assertEqual(util.role_maker, None)
 
     def test_get_util(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         default_util = fleet.util
         self.assertEqual(default_util, None)
 
     def test_set_user_defined_util(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
 
         class UserDefinedUtil(fleet.UtilBase):
             def __init__(self):
@@ -72,7 +72,7 @@ class TestFleetUtil(unittest.TestCase):
             def get_user_id(self):
                 return 10
 
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
@@ -81,7 +81,7 @@ class TestFleetUtil(unittest.TestCase):
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
-        from paddle.fleet.utils import LocalFS
+        from paddle.distributed.fleet.utils import LocalFS
         fs = LocalFS()
         dirs, files = fs.ls_dir("test_tmp")
         dirs, files = fs.ls_dir("./")
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index 6d78d3a47361d63fc722758fd741a30bff0024d7..c01876531c99c610706265ff646d93c4a197a26e 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -20,7 +20,7 @@ import os
 import sys
 import inspect
 
-from paddle.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 
 class FSTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py
index 21cbab193419be9413c487c8631671097016d959..ba14aeae990329915e080969ca74b8a9658632e9 100644
--- a/python/paddle/fluid/tests/unittests/test_full_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 import paddle.compat as cpt
 import unittest
 import numpy as np
@@ -38,7 +38,7 @@ class TestFullOp(unittest.TestCase):
             place = paddle.CPUPlace()
             if core.is_compiled_with_cuda():
                 place = paddle.CUDAPlace(0)
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             exe.run(startup_program)
 
             img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
@@ -53,12 +53,13 @@ class TestFullOp(unittest.TestCase):
                 msg="full_like output is wrong, out = " + str(out_np))
 
     def test_full_like_imperative(self):
-        with paddle.imperative.guard():
-            input = paddle.arange(6, 10, dtype='float32')
-            out = paddle.full_like(input, fill_value=888.88, dtype='float32')
-            out_numpy = np.random.random((4)).astype("float32")
-            out_numpy.fill(888.88)
-            self.assertTrue((out.numpy() == out_numpy).all(), True)
+        paddle.disable_static()
+        input = paddle.arange(6, 10, dtype='float32')
+        out = paddle.full_like(input, fill_value=888.88, dtype='float32')
+        out_numpy = np.random.random((4)).astype("float32")
+        out_numpy.fill(888.88)
+        self.assertTrue((out.numpy() == out_numpy).all(), True)
+        paddle.enable_static()
 
 
 class TestFullOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
index c43454eaaee9e3b2f9aa371453e58b009c99a52c..68be0bf5d561ef0d8fe92005dd9ddb47c21aca51 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
@@ -37,7 +37,6 @@ class TestFunctionalConv2D(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def prepare(self):
@@ -88,7 +87,6 @@ class TestFunctionalConv2D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -121,9 +119,11 @@ class TestFunctionalConv2D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -144,10 +144,12 @@ class TestFunctionalConv2D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
+
             out = y.numpy()
         return out
 
@@ -185,7 +187,6 @@ class TestFunctionalConv2DError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def test_exception(self):
@@ -228,9 +229,7 @@ class TestFunctionalConv2DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
@@ -383,21 +382,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
         self.data_format = "NCHW"
 
 
-class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
-        self.data_format = "NCHW"
-
-
 class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
     def setUp(self):
         self.in_channels = 3
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
index 21986f1b98d869289ddb34a65316aca57c83f9d9..1fb07bf4345909deb5485a89232270336658ae8b 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
@@ -37,8 +37,6 @@ class TestFunctionalConv2D(TestCase):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def prepare(self):
@@ -90,8 +88,6 @@ class TestFunctionalConv2D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
         exe.run(start)
@@ -115,7 +111,7 @@ class TestFunctionalConv2D(TestCase):
                     "weight", self.weight.shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
+                y = F.conv_transpose2d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -124,9 +120,7 @@ class TestFunctionalConv2D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -140,7 +134,7 @@ class TestFunctionalConv2D(TestCase):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv2d_transpose(
+            y = F.conv_transpose2d(
                 x,
                 weight,
                 bias,
@@ -148,10 +142,8 @@ class TestFunctionalConv2D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
             out = y.numpy()
         return out
 
@@ -189,8 +181,6 @@ class TestFunctionalConv2DError(TestCase):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def test_exception(self):
@@ -225,7 +215,7 @@ class TestFunctionalConv2DError(TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
+                y = F.conv_transpose2d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -234,9 +224,7 @@ class TestFunctionalConv2DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
@@ -249,8 +237,6 @@ class TestFunctionalConv2DCase2(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -264,8 +250,6 @@ class TestFunctionalConv2DCase3(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = True
-        self.act = None
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -279,8 +263,6 @@ class TestFunctionalConv2DCase4(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -294,8 +276,6 @@ class TestFunctionalConv2DCase5(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -309,8 +289,6 @@ class TestFunctionalConv2DCase6(TestFunctionalConv2D):
         self.dilation = (2, 1)
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -324,8 +302,6 @@ class TestFunctionalConv2DCase7(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 4
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NHWC"
 
 
@@ -340,8 +316,6 @@ class TestFunctionalConv2DCase8(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -355,8 +329,6 @@ class TestFunctionalConv2DCase9(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -370,8 +342,6 @@ class TestFunctionalConv2DCase10(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -385,8 +355,6 @@ class TestFunctionalConv2DCase11(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -400,8 +368,6 @@ class TestFunctionalConv2DCase12(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -415,8 +381,6 @@ class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -430,8 +394,6 @@ class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -445,8 +407,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -460,23 +420,6 @@ class TestFunctionalConv2DErrorCase5(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCHW"
 
 
@@ -491,8 +434,6 @@ class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -506,8 +447,6 @@ class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -521,8 +460,6 @@ class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
index 195e3812f94843f6ccdd05cbc317238765e4c06b..b413a56c07a9ce3afbe15baffbffaf92a3d42129 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
@@ -37,7 +37,6 @@ class TestFunctionalConv3D(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def prepare(self):
@@ -88,7 +87,6 @@ class TestFunctionalConv3D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -121,9 +119,11 @@ class TestFunctionalConv3D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -144,10 +144,12 @@ class TestFunctionalConv3D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
+
             out = y.numpy()
         return out
 
@@ -185,7 +187,6 @@ class TestFunctionalConv3DError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def test_exception(self):
@@ -228,9 +229,10 @@ class TestFunctionalConv3DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DCase2(TestFunctionalConv3D):
@@ -244,7 +246,6 @@ class TestFunctionalConv3DCase2(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -259,7 +260,6 @@ class TestFunctionalConv3DCase3(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -274,7 +274,6 @@ class TestFunctionalConv3DCase4(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -289,7 +288,6 @@ class TestFunctionalConv3DCase5(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -304,7 +302,6 @@ class TestFunctionalConv3DCase6(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -319,7 +316,6 @@ class TestFunctionalConv3DCase7(TestFunctionalConv3D):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -349,7 +345,6 @@ class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NCDHW"
 
 
@@ -364,7 +359,6 @@ class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "not_valid"
 
 
@@ -379,22 +373,6 @@ class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase6(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCDHW"
 
 
@@ -409,7 +387,6 @@ class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -424,7 +401,6 @@ class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -439,7 +415,6 @@ class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NCDHW"
 
 
@@ -454,7 +429,6 @@ class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NDHWC"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
index f8e7818315fa077df4d8ad0d6d3f76b47501b5e9..7441f7cb915e8b1fdd2155fff79e145fb6a00c0f 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
@@ -38,7 +38,6 @@ class TestFunctionalConv3DTranspose(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def prepare(self):
@@ -90,7 +89,6 @@ class TestFunctionalConv3DTranspose(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -115,7 +113,7 @@ class TestFunctionalConv3DTranspose(TestCase):
                     "weight", self.weight.shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
+                y = F.conv_transpose3d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -124,9 +122,9 @@ class TestFunctionalConv3DTranspose(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -140,7 +138,7 @@ class TestFunctionalConv3DTranspose(TestCase):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv3d_transpose(
+            y = F.conv_transpose3d(
                 x,
                 weight,
                 bias,
@@ -148,10 +146,10 @@ class TestFunctionalConv3DTranspose(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
             out = y.numpy()
         return out
 
@@ -190,7 +188,6 @@ class TestFunctionalConv3DTransposeError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def test_exception(self):
@@ -225,7 +222,7 @@ class TestFunctionalConv3DTransposeError(TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
+                y = F.conv_transpose3d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -234,9 +231,9 @@ class TestFunctionalConv3DTransposeError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
@@ -250,7 +247,6 @@ class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -265,7 +261,6 @@ class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -280,7 +275,6 @@ class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = True
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -295,7 +289,6 @@ class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -310,7 +303,6 @@ class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
         self.groups = 4
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NDHWC"
 
 
@@ -326,7 +318,6 @@ class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -341,7 +332,6 @@ class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -356,7 +346,6 @@ class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -371,7 +360,6 @@ class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -386,7 +374,6 @@ class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -402,7 +389,6 @@ class TestFunctionalConv3DTransposeErrorCase2(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -418,7 +404,6 @@ class TestFunctionalConv3DTransposeErrorCase3(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -434,7 +419,6 @@ class TestFunctionalConv3DTransposeErrorCase4(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -450,23 +434,6 @@ class TestFunctionalConv3DTransposeErrorCase5(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeErrorCase6(
-        TestFunctionalConv3DTransposeError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCDHW"
 
 
@@ -483,7 +450,6 @@ class TestFunctionalConv3DTransposeErrorCase7(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -499,7 +465,6 @@ class TestFunctionalConv3DTransposeErrorCase8(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -515,7 +480,6 @@ class TestFunctionalConv3DTransposeErrorCase9(
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
index 892f63bf15b742c51ddbc15262f888e43cdd03f3..bd934c76ebfa2ed7c9b11223b34c812e605ebe18 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
@@ -18,12 +18,11 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 
 
 class TestGatherNdOpWithEmptyIndex(OpTest):
-    """
-    Index has empty element, which means copy entire tensor
-    """
+    #Index has empty element, which means copy entire tensor
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -40,10 +39,22 @@ class TestGatherNdOpWithEmptyIndex(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestGatherNdOpWithIndex1(OpTest):
+    def setUp(self):
+        self.op_type = "gather_nd"
+        xnp = np.random.random((5, 20)).astype("float64")
+        self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")}
+        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestGatherNdOpWithLowIndex(OpTest):
-    """
-    Index has low rank, X has high rank
-    """
+    #Index has low rank, X has high rank
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -61,10 +72,27 @@ class TestGatherNdOpWithLowIndex(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestGatherNdOpIndex1(OpTest):
+    #Index has low rank, X has high rank
+
+    def setUp(self):
+        self.op_type = "gather_nd"
+        xnp = np.random.uniform(0, 100, (10, 10)).astype("float64")
+        index = np.array([1, 2]).astype("int64")
+
+        self.inputs = {'X': xnp, 'Index': index}
+
+        self.outputs = {'Out': xnp[tuple(index.T)]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestGatherNdOpWithSameIndexAsX(OpTest):
-    """
-    Index has same rank as X's rank
-    """
+    #Index has same rank as X's rank
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -82,9 +110,7 @@ class TestGatherNdOpWithSameIndexAsX(OpTest):
 
 
 class TestGatherNdOpWithHighRankSame(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) = Rank(X)
-    """
+    #Both Index and X have high rank, and Rank(Index) = Rank(X)
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -103,9 +129,7 @@ class TestGatherNdOpWithHighRankSame(OpTest):
 
 
 class TestGatherNdOpWithHighRankDiff(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) < Rank(X)
-    """
+    #Both Index and X have high rank, and Rank(Index) < Rank(X)
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -162,5 +186,63 @@ class TestGatherNdOpRaise(unittest.TestCase):
         self.assertRaises(IndexError, check_raise_is_test)
 
 
+class TestGatherNdError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='float32', name='x')
+            index = paddle.data(shape=shape, dtype='bool', name='index')
+            index_float = paddle.data(
+                shape=shape, dtype='float32', name='index_float')
+            np_x = np.random.random(shape).astype('float32')
+            np_index = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+            def test_x_type():
+                paddle.gather_nd(np_x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.gather_nd(x, np_index)
+
+            self.assertRaises(TypeError, test_index_type)
+
+            def test_index_dtype():
+                paddle.gather_nd(x, index_float)
+
+            self.assertRaises(TypeError, test_index_dtype)
+
+
+class TestGatherNdAPI2(unittest.TestCase):
+    def test_static(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
+            index = fluid.layers.data('index', shape=[-1, 1], dtype='int32')
+            out = paddle.gather_nd(data1, index)
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            input = np.array([[1, 2], [3, 4], [5, 6]])
+            index_1 = np.array([[1]])
+            result, = exe.run(feed={"data1": input,
+                                    "index": index_1},
+                              fetch_list=[out])
+            expected_output = np.array([[3, 4]])
+        self.assertTrue(np.allclose(result, expected_output))
+
+    def test_imperative(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([[1]])
+        input = fluid.dygraph.to_variable(input_1)
+        index = fluid.dygraph.to_variable(index_1)
+        output = paddle.fluid.layers.gather(input, index)
+        output_np = output.numpy()
+        expected_output = np.array([3, 4])
+        self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index f8763e731eeed3b36a6271167a57b9277479b5ba..1f6e522d2668b5dcd2075ff7af6b4b1ee674632d 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -21,6 +21,13 @@ import paddle
 import paddle.fluid as fluid
 
 
+def gather_numpy(x, index, axis):
+    x_transpose = np.swapaxes(x, 0, axis)
+    tmp_gather = x_transpose[index, ...]
+    gather = np.swapaxes(tmp_gather, 0, axis)
+    return gather
+
+
 class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
@@ -108,12 +115,80 @@ class TestCase6(TestGatherOp):
         self.index_type = "int32"
 
 
+class TestGatherOp1(OpTest):
+    def setUp(self):
+        self.op_type = "gather"
+        self.config()
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        axis_np = np.array(self.axis).astype(self.index_type)
+        index_np = np.array(self.index).astype(self.index_type)
+        out = gather_numpy(xnp, index_np, axis_np[0])
+        self.inputs = {'X': xnp, 'Index': index_np, 'Axis': axis_np}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (3, 88, 3)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int32"
+        self.axis = [1]
+        self.axis_type = "int32"
+
+
+class TestGatherOp2(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (10, 88, 10)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int64"
+        self.axis = [0]
+        self.axis_type = "int32"
+
+
+class TestGatherOp3(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (10, 88, 10)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int64"
+        self.axis = [2]
+        self.axis_type = "int32"
+
+
+class TestGatherOp4(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (3, 100, 10)
+        self.x_type = "float64"
+        self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        self.index_type = "int64"
+        self.axis = [0]
+        self.axis_type = "int32"
+
+
 class API_TestGather(unittest.TestCase):
-    def test_out(self):
+    def test_out1(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
-            index = fluid.layers.data('index', shape=[-1, 1], dtype='float64')
-            out = paddle.gather(data1, index)
+            index = fluid.layers.data('index', shape=[-1, 1], dtype='int32')
+            out = paddle.fluid.layers.gather(data1, index)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input = np.array([[1, 2], [3, 4], [5, 6]])
@@ -124,18 +199,103 @@ class API_TestGather(unittest.TestCase):
             expected_output = np.array([[3, 4], [5, 6]])
         self.assertTrue(np.allclose(result, expected_output))
 
+    def test_out2(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data('x', shape=[-1, 2], dtype='float64')
+            index = paddle.data('index', shape=[-1, 1], dtype='int32')
+            axis = paddle.data('axis', shape=[1], dtype='int32')
+            out = paddle.gather(x, index, axis)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float64')
+            index_np = np.array([1, 1]).astype('int32')
+            axis_np = np.array([1]).astype('int32')
+            result, = exe.run(
+                feed={"x": x_np,
+                      "index": index_np,
+                      'axis': axis_np},
+                fetch_list=[out])
+            expected_output = gather_numpy(x_np, index_np, axis_np)
+        self.assertTrue(np.allclose(result, expected_output))
+
 
 class API_TestDygraphGather(unittest.TestCase):
-    def test_out(self):
-        with fluid.dygraph.guard():
-            input_1 = np.array([[1, 2], [3, 4], [5, 6]])
-            index_1 = np.array([1, 2])
-            input = fluid.dygraph.to_variable(input_1)
-            index = fluid.dygraph.to_variable(index_1)
-            output = paddle.fluid.layers.gather(input, index)
-            output_np = output.numpy()
-            expected_output = np.array([[3, 4], [5, 6]])
+    def test_out1(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([1, 2])
+        input = paddle.to_tensor(input_1)
+        index = paddle.to_tensor(index_1)
+        output = paddle.fluid.layers.gather(input, index)
+        output_np = output.numpy()
+        expected_output = np.array([[3, 4], [5, 6]])
+        self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+    def test_out12(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([1, 2])
+        x = paddle.to_tensor(input_1)
+        index = paddle.to_tensor(index_1)
+        output = paddle.gather(x, index, axis=0)
+        output_np = output.numpy()
+        expected_output = gather_numpy(input_1, index_1, axis=0)
         self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+
+class TestGathertError(unittest.TestCase):
+    def test_error1(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='int8', name='x')
+            axis = paddle.data(shape=[1], dtype='float32', name='axis')
+            index = paddle.data(shape=shape, dtype='int32', name='index')
+            index_float = paddle.data(
+                shape=shape, dtype='float32', name='index_float')
+
+            def test_x_type():
+                paddle.gather(x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.gather(x, index_float)
+
+            self.assertRaises(TypeError, test_index_type)
+
+            def test_axis_dtype():
+                paddle.gather(x, index, axis=1.11)
+
+            self.assertRaises(TypeError, test_axis_dtype)
+
+            def test_axis_dtype():
+                paddle.gather(x, index, axis=axis)
+
+            self.assertRaises(TypeError, test_axis_dtype)
+
+    def test_error2(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+
+            shape = [8, 9, 6]
+            x = fluid.data(shape=shape, dtype='int8', name='x')
+            index = fluid.data(shape=shape, dtype='int32', name='mask')
+            index_float = fluid.data(
+                shape=shape, dtype='float32', name='index_float')
+
+            def test_x_type():
+                paddle.fluid.layers.gather(x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.fluid.layers.gather(x, index_float)
+
+            self.assertRaises(TypeError, test_index_type)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index a5d36203b0ad567a4ba25d686652c9384ea424bf..5054256ca72477785076cdff69266160f6c7d640 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -224,7 +224,8 @@ def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
 
 class TestGenerateProposalLabelsOp(OpTest):
     def set_data(self):
-        self.use_random = False
+        #self.use_random = False
+        self.init_use_random()
         self.init_test_cascade()
         self.init_test_params()
         self.init_test_input()
@@ -267,6 +268,9 @@ class TestGenerateProposalLabelsOp(OpTest):
     def init_test_cascade(self, ):
         self.is_cascade_rcnn = False
 
+    def init_use_random(self):
+        self.use_random = False
+
     def init_test_params(self):
         self.batch_size_per_im = 512
         self.fg_fraction = 0.25
@@ -329,6 +333,28 @@ class TestCascade(TestGenerateProposalLabelsOp):
         self.is_cascade_rcnn = True
 
 
+class TestUseRandom(TestGenerateProposalLabelsOp):
+    def init_use_random(self):
+        self.use_random = True
+        self.is_cascade_rcnn = False
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_out)
+
+    def verify_out(self, outs):
+        print("skip")
+
+    def init_test_params(self):
+        self.batch_size_per_im = 512
+        self.fg_fraction = 0.025
+        self.fg_thresh = 0.5
+        self.bg_thresh_hi = 0.5
+        self.bg_thresh_lo = 0.0
+        self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2]
+        self.is_cls_agnostic = False
+        self.class_nums = 2 if self.is_cls_agnostic else 81
+
+
 class TestClsAgnostic(TestCascade):
     def init_test_params(self):
         self.batch_size_per_im = 512
diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc43d3d5498284e8a24dd272eaed08cdf830733
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_generator.py
@@ -0,0 +1,44 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.fluid.generator as generator
+import time  # temp for debug
+
+
+class TestGenerator(unittest.TestCase):
+    """
+    Test cases for cpu generator.
+    """
+
+    def test_basic_generator(self):
+        """Test basic generator."""
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+        s = gen.initial_seed()
+        s = gen.seed()
+        st = gen.get_state()
+        gen.set_state(st)
+        gen.random()
+        gen.set_cpu_engine(gen.get_cpu_engine())
+
+    def test_basic_generator_error(self):
+        self.assertRaises(ValueError, generator.Generator, device="CUDA")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
index 6660bfb0c747300741b305a101734e1ef808eeb5..4f0beb8c0dcd5384e7b9f6e30e8082595ac4dc06 100644
--- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
@@ -124,14 +124,8 @@ class TestBase(unittest.TestCase):
                             label = item['label']
                             assert image.shape() == [BATCH_SIZE, 784]
                             assert label.shape() == [BATCH_SIZE, 1]
-                            if ps[i]._equals(fluid.CPUPlace()):
-                                assert image._place()._equals(fluid.CPUPlace())
-                                assert label._place()._equals(fluid.CPUPlace())
-                            else:
-                                assert image._place()._equals(
-                                    fluid.CUDAPinnedPlace())
-                                assert label._place()._equals(
-                                    fluid.CUDAPinnedPlace())
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
                         L, = exe.run(program=prog,
                                      feed=d,
                                      fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a33f32a0b6977716d8065419f8e0f88d6c4f44a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import unittest
+
+
+class GridSampleTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 x_shape=[2, 2, 3, 3],
+                 grid_shape=[2, 3, 3, 2],
+                 mode="bilinear",
+                 padding_mode="zeros",
+                 align_corners=False):
+        super(GridSampleTestCase, self).__init__(methodName)
+        self.padding_mode = padding_mode
+        self.x_shape = x_shape
+        self.grid_shape = grid_shape
+        self.mode = mode
+        self.padding_mode = padding_mode
+        self.align_corners = align_corners
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.x = np.random.randn(*(self.x_shape)).astype(self.dtype)
+        self.grid = np.random.uniform(-1, 1, self.grid_shape).astype(self.dtype)
+
+    def static_functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data("x", self.x_shape, dtype=self.dtype)
+                grid = fluid.data("grid", self.grid_shape, dtype=self.dtype)
+                y_var = F.grid_sample(
+                    x,
+                    grid,
+                    mode=self.mode,
+                    padding_mode=self.padding_mode,
+                    align_corners=self.align_corners)
+        feed_dict = {"x": self.x, "grid": self.grid}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def dynamic_functional(self):
+        x_t = paddle.to_tensor(self.x)
+        grid_t = paddle.to_tensor(self.grid)
+        y_t = F.grid_sample(
+            x_t,
+            grid_t,
+            mode=self.mode,
+            padding_mode=self.padding_mode,
+            align_corners=self.align_corners)
+        y_np = y_t.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.static_functional(place)
+        with dg.guard(place):
+            result2 = self.dynamic_functional()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class GridSampleErrorTestCase(GridSampleTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with self.assertRaises(ValueError):
+            self.static_functional(place)
+
+
+def add_cases(suite):
+    suite.addTest(GridSampleTestCase(methodName='runTest'))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='reflect',
+            align_corners=True))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=True))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', padding_mode="VALID"))
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', align_corners="VALID"))
+    suite.addTest(GridSampleErrorTestCase(methodName='runTest', mode="VALID"))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index bd5a07769e30de5110566f630de2d480e3426c77..4d1ed5aeb96ebbe064e35c1bee9d5775812440f7 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -17,17 +17,17 @@ import numpy as np
 from op_test import OpTest
 
 
-def AffineGrid(theta, size):
-    n = size[0]
-    h = size[2]
-    w = size[3]
+def AffineGrid(theta, grid_shape):
+    n = grid_shape[0]
+    h = grid_shape[1]
+    w = grid_shape[2]
     h_idx = np.repeat(
         np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
     w_idx = np.repeat(
         np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
     grid = np.concatenate(
         [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
-    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+    grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * h * w *3
 
     ret = np.zeros([n, h * w, 2])
     theta = theta.transpose([0, 2, 1])
@@ -40,15 +40,19 @@ def AffineGrid(theta, size):
 def getGridPointValue(data, x, y):
     data_shape = data.shape
     N = data_shape[0]
-    H = data_shape[2]
-    W = data_shape[3]
-
-    out = np.zeros(data_shape, dtype='float64')
+    C = data_shape[1]
+    in_H = data_shape[2]
+    in_W = data_shape[3]
+    out_H = x.shape[1]
+    out_W = x.shape[2]
+
+    #out = np.zeros(data_shape, dtype='float64')
+    out = np.zeros([N, C, out_H, out_W], dtype='float64')
     for i in range(N):
-        for j in range(H):
-            for k in range(W):
-                if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[
-                        i, j, k] > W - 1:
+        for j in range(out_H):
+            for k in range(out_W):
+                if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[
+                        i, j, k] < 0 or x[i, j, k] > in_W - 1:
                     out[i, :, j, k] = 0
                 else:
                     out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
@@ -56,44 +60,89 @@ def getGridPointValue(data, x, y):
     return out
 
 
-def GridSampler(data, grid):
-    dims = data.shape
-    N = dims[0]
-    C = dims[1]
-    H = dims[2]
-    W = dims[3]
+def clip(x, min_n, max_n):
+    return np.maximum(np.minimum(x, max_n), min_n)
 
-    x = grid[:, :, :, 0]
-    y = grid[:, :, :, 1]
-    y_max = H - 1
-    x_max = W - 1
 
-    x = 0.5 * ((x.astype('float64') + 1.0) * x_max)
-    y = 0.5 * ((y.astype('float64') + 1.0) * y_max)
+def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
+    if align_corners:
+        grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val)
+    else:
+        grid_slice = 0.5 * (
+            (grid_slice.astype('float64') + 1.0) * (max_val + 1)) - 0.5
+
+    if padding_mode == "border":
+        grid_slice = clip(grid_slice, 0, max_val)
+    elif padding_mode == "reflect":
+        double_range = 2 * max_val if align_corners else (max_val + 1) * 2
+        grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
+                                                                   0.5)
+        extra = grid_abs - np.floor(grid_abs / double_range) * double_range
+        grid_slice = np.minimum(extra, double_range - extra)
+        grid_slice = grid_slice if align_corners else clip(grid_slice - 0.5, 0,
+                                                           max_val)
+    return grid_slice
 
-    x0 = np.floor(x).astype('int32')
-    x1 = x0 + 1
-    y0 = np.floor(y).astype('int32')
-    y1 = y0 + 1
 
-    wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
+def GridSampler(data,
+                grid,
+                align_corners=True,
+                mode="bilinear",
+                padding_mode="zeros"):
+    dims = data.shape
+    N = dims[0]
+    in_C = dims[1]
+    in_H = dims[2]
+    in_W = dims[3]
 
-    va = getGridPointValue(data, x0, y0)
-    vb = getGridPointValue(data, x0, y1)
-    vc = getGridPointValue(data, x1, y0)
-    vd = getGridPointValue(data, x1, y1)
+    out_H = grid.shape[1]
+    out_W = grid.shape[2]
 
-    out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    x = grid[:, :, :, 0]
+    y = grid[:, :, :, 1]
+    y_max = in_H - 1
+    x_max = in_W - 1
+
+    x = unnormalizeAndClip(x, x_max, align_corners, padding_mode)
+    y = unnormalizeAndClip(y, y_max, align_corners, padding_mode)
+
+    if mode == "bilinear":
+        x0 = np.floor(x).astype('int32')
+        x1 = x0 + 1
+        y0 = np.floor(y).astype('int32')
+        y1 = y0 + 1
+
+        wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+
+        va = getGridPointValue(data, x0, y0)
+        vb = getGridPointValue(data, x0, y1)
+        vc = getGridPointValue(data, x1, y0)
+        vd = getGridPointValue(data, x1, y1)
+
+        out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    elif mode == "nearest":
+        x = np.round(x).astype('int32')
+        y = np.round(y).astype('int32')
+        out = getGridPointValue(data, x, y)
     return out
 
 
 class TestGridSamplerOp(OpTest):
     def setUp(self):
-        self.initTestCase()
+        self.use_cudnn = False
+        self.numeric_grad_delta = 0.0001
         self.op_type = 'grid_sampler'
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.initTestCase()
         x = np.random.randint(0, 255, self.x_shape).astype('float64')
 
         theta = np.zeros(self.theta_shape).astype('float64')
@@ -101,22 +150,90 @@ class TestGridSamplerOp(OpTest):
             for j in range(2):
                 for k in range(3):
                     theta[i, j, k] = np.random.rand(1)[0]
-        grid = AffineGrid(theta, self.x_shape)
+        grid = AffineGrid(theta, self.grid_shape)
 
         self.inputs = {'X': x, 'Grid': grid}
-        self.attrs = {'use_cudnn': True}
-        self.outputs = {'Output': GridSampler(x, grid)}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            "align_corners": self.align_corners,
+            "padding_mode": self.padding_mode,
+            "mode": self.mode
+        }
+        #    print("X: {}".format(x))
+        self.outputs = {
+            'Output': GridSampler(x, grid, self.align_corners, self.mode,
+                                  self.padding_mode)
+        }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
+        self.check_grad(
+            ['X', 'Grid'],
+            'Output',
+            max_relative_error=0.01,
+            numeric_grad_delta=self.numeric_grad_delta)
+
+    def initTestCase(self):
+        self.x_shape = (2, 3, 8, 8)
+        self.grid_shape = (2, 7, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.use_cudnn = True
+
+
+class Case1(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+
+
+class Case1(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "border"
+        self.mode = "bilinear"
+
+
+class Case2(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+
+
+class Case3(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+
 
+class Case4(TestGridSamplerOp):
     def initTestCase(self):
-        self.x_shape = (2, 5, 7, 3)
-        self.grid_shape = (2, 7, 3, 2)
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "nearest"
+        self.numeric_grad_delta = 0.0001
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..654e8d6f129e1ffe0dce59113ca88a16d348f210
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestDygraphGroupNormv2(unittest.TestCase):
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [2, 6, 2, 2]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
+                    y = gn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
+                    y = gn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [2, 6, 2, 2]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = gn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = gn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py
new file mode 100644
index 0000000000000000000000000000000000000000..430ed1abe860869d791f0eac17accc8416db1eca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest1(FSTestBase):
+    def test_timeout(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        src = "hdfs_test_timeout"
+        dst = "new_hdfs_test_timeout"
+        fs.delete(dst)
+        fs.mkdirs(src)
+        fs.mkdirs(dst)
+        fs.mkdirs(dst + "/" + src)
+        output = ""
+        try:
+            fs.mv(src, dst, test_exists=False)
+            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
+                                                                        output))
+        except FSTimeOut as e:
+            print("execute mv {} to {} timeout".format(src, dst))
+
+        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
+        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
+        self.assertNotEqual(ret, 0)
+        print("second mv ret:{} output:{}".format(ret, output))
+
+    def test_is_dir(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        self.assertFalse(fs.is_dir("./test_hdfs.py"))
+        s = """
+java.io.IOException: Input/output error
+ responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
+	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
+	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
+	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
+	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
+	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
+	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
+	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
+	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
+        """
+
+        print("split lines:", s.splitlines())
+        self.assertTrue(fs._test_match(s.splitlines()) != None)
+
+    def test_config(self):
+        config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            config,
+            time_out=6 * 1000,
+            sleep_inter=100)
+
+    def test_exists(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
+        self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
+        self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
+        dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs1.py"))
+        self.assertTrue(dirs == [])
+        self.assertTrue(len(files) == 1)
+        dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7754f89e3c901ac14cb102881e8d338442038559
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest2(FSTestBase):
+    def test_hdfs(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=5 * 1000,
+            sleep_inter=100)
+        self._test_rm(fs)
+        self._test_touch(fs)
+        self._test_dirs(fs)
+
+    def test_local(self):
+        fs = LocalFS()
+        self._test_rm(fs)
+        self._test_touch(fs)
+        self._test_dirs(fs)
+
+        self._test_touch_file(fs)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a045f4b17fc9b8b68ccf81a23cb953db58a9db7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest3(FSTestBase):
+    def test_hdfs(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=5 * 1000,
+            sleep_inter=100)
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)
+
+        self._test_upload(fs)
+        self._test_download(fs)
+
+    def test_local(self):
+        fs = LocalFS()
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f3858d6fb242b8689bd1d300861faf8ed73e54
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -0,0 +1,273 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import six
+from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
+
+
+class SimpleConv(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(SimpleConv, self).__init__()
+        self._conv = fluid.dygraph.Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=None,
+            use_cudnn=True)
+
+    def forward(self, inputs):
+        return self._conv(inputs)
+
+
+class TestAutoCast(unittest.TestCase):
+    def test_amp_guard_white_op(self):
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard(True):
+                out_fp16 = conv2d(data)
+
+            with fluid.dygraph.amp_guard(False):
+                out_fp32 = conv2d(data)
+
+        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
+        self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
+        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
+
+    def test_amp_guard_black_op(self):
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard(True):
+                out_fp32 = fluid.layers.mean(data)
+
+        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
+        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
+
+    def test_custom_op_list(self):
+        with fluid.dygraph.guard():
+            tracer = fluid.framework._dygraph_tracer()
+            base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
+            base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
+            with fluid.dygraph.amp_guard(
+                    custom_white_list=["log"], custom_black_list=["conv2d"]):
+                white_list, black_list = tracer._get_amp_op_list()
+                self.assertTrue(
+                    set(white_list) ==
+                    (set(base_white_list) | {"log"}) - {"conv2d"})
+
+                self.assertTrue(
+                    set(black_list) ==
+                    (set(base_black_list) - {"log"}) | {"conv2d"})
+
+    def test_custom_op_list_exception(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+
+        def func():
+            with fluid.dygraph.guard():
+                model = SimpleConv(
+                    num_channels=3,
+                    num_filters=64,
+                    filter_size=7,
+                    stride=2,
+                    act='relu')
+
+                with fluid.dygraph.amp_guard(
+                        custom_white_list=["conv2d"],
+                        custom_black_list=["conv2d"]):
+                    inp = fluid.dygraph.to_variable(inp_np)
+                    out = model(inp)
+
+        self.assertRaises(ValueError, func)
+
+
+class TestAmpScaler(unittest.TestCase):
+    def test_scale(self):
+        with fluid.dygraph.guard():
+            data = paddle.rand([10, 1024])
+            scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            scaled_data = scaler.scale(data)
+            self.assertEqual(
+                np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True)
+
+    def test_minimize(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+
+        def run_simple_conv(inp_np, use_scaler=True):
+            paddle.manual_seed(10)
+            with fluid.dygraph.guard():
+                model = SimpleConv(
+                    num_channels=3,
+                    num_filters=64,
+                    filter_size=7,
+                    stride=2,
+                    act='relu')
+                optimizer = fluid.optimizer.SGDOptimizer(
+                    learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(inp_np)
+
+                out = model(data)
+                loss = fluid.layers.mean(out)
+                if use_scaler:
+                    print('use scaler')
+                    scaled_loss = scaler.scale(loss)
+                    scaled_loss.backward()
+                    optimize_ops, params_grads = scaler.minimize(optimizer,
+                                                                 scaled_loss)
+                else:
+                    print('use no scaler')
+                    loss.backward()
+                    optimize_ops, params_grads = optimizer.minimize(loss)
+            return optimize_ops, params_grads
+
+        outs_with_scaler = run_simple_conv(inp_np, use_scaler=True)
+        outs_no_scaler = run_simple_conv(inp_np, use_scaler=False)
+
+        self.assertEqual(outs_with_scaler[0],
+                         [])  # optimize_ops is [] in dygraph mode
+        self.assertEqual(outs_no_scaler[0],
+                         [])  # optimize_ops is [] in dygraph mode
+        for i in range(len(outs_with_scaler[1])):
+            # check each grad
+            self.assertEqual(
+                np.allclose(outs_with_scaler[1][i][1].numpy(),
+                            outs_no_scaler[1][i][1].numpy()), True)
+            # check each parameter
+            self.assertEqual(
+                np.allclose(outs_with_scaler[1][i][0].numpy(),
+                            outs_no_scaler[1][i][0].numpy()), True)
+
+    def test_nan_inf(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+        inp_np[0][1][2][3] = np.nan
+        with fluid.dygraph.guard():
+            model = SimpleConv(
+                num_channels=3,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            params_init = {}
+            for param in model.parameters():
+                params_init[param.name] = param.numpy()
+            optimizer = fluid.optimizer.SGDOptimizer(
+                learning_rate=0.01, parameter_list=model.parameters())
+            scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            data = fluid.dygraph.to_variable(inp_np)
+
+            out = model(data)
+            loss = fluid.layers.mean(out)
+            scaled_loss = scaler.scale(loss)
+            scaled_loss.backward()
+            optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
+            self.assertEqual(scaler._found_inf.numpy() == 1, True)
+
+            for param in model.parameters():
+                # param not update when tensor contains nan or inf
+                self.assertTrue(
+                    np.array_equal(param.numpy(), params_init[param.name]))
+
+
+class TestResnet(unittest.TestCase):
+    def train_resnet(self, enable_amp=True):
+        seed = 90
+
+        batch_size = train_parameters["batch_size"]
+        batch_num = 1
+
+        with fluid.dygraph.guard():
+            paddle.manual_seed(seed)
+
+            resnet = ResNet(use_cudnn=True)
+            optimizer = optimizer_setting(
+                train_parameters, parameter_list=resnet.parameters())
+            np.random.seed(seed)
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size)
+
+            dy_param_init_value = {}
+            for param in resnet.parameters():
+                dy_param_init_value[param.name] = param.numpy()
+
+            program = None
+            scaler = paddle.fluid.dygraph.AmpScaler(
+                enable=enable_amp, init_loss_scaling=2.**10)
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= batch_num:
+                    break
+                dy_x_data = np.array(
+                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                if len(np.array([x[1]
+                                 for x in data]).astype('int64')) != batch_size:
+                    continue
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    -1, 1)
+                img = fluid.dygraph.to_variable(dy_x_data)
+                label = fluid.dygraph.to_variable(y_data)
+                label.stop_gradient = True
+                with paddle.fluid.dygraph.amp_guard(enable=enable_amp):
+                    out = resnet(img)
+
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+
+                dy_out = avg_loss.numpy()
+
+                scaled_loss = scaler.scale(avg_loss)
+                scaled_loss.backward()
+
+                scaler.minimize(optimizer, scaled_loss)
+
+                dy_grad_value = {}
+                for param in resnet.parameters():
+                    if param.trainable:
+                        np_array = np.array(param._grad_ivar().value()
+                                            .get_tensor())
+                        dy_grad_value[param.name + fluid.core.grad_var_suffix(
+                        )] = np_array
+
+                resnet.clear_gradients()
+
+                dy_param_value = {}
+                for param in resnet.parameters():
+                    dy_param_value[param.name] = param.numpy()
+
+        return dy_out, dy_param_value, dy_grad_value
+
+    def test_resnet(self):
+        out_fp32 = self.train_resnet(enable_amp=False)
+        out_amp = self.train_resnet(enable_amp=True)
+        print(out_fp32[0], out_amp[0])
+        self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 2a25bf6f8abade11d9ad25894753f6d17066e7fd..837e82882e9df8f50ca83a5df20ddf0f03ee504b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -238,8 +238,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             out2 = linear2(b)
             out1.stop_gradient = True
             out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            out.backward(backward_strategy)
+            out.backward()
             self.assertTrue(linear.weight.gradient() is None)
             self.assertTrue(out1.gradient() is None)
 
@@ -311,9 +310,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
             out2 = linear2(b)
             out1.stop_gradient = True
             out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            out.backward()
             self.assertTrue(linear.weight.gradient() is None)
             self.assertTrue(out1.gradient() is None)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 9b6c307bbec5d272aa3c5644aeaabfe9d7f5df8f..b74182d27ab8c89cc43d3fc1656ca13916d159c1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -21,6 +21,7 @@ from paddle.fluid import core
 from paddle.fluid import Linear
 from test_imperative_base import new_program_scope
 import paddle.fluid.dygraph_utils as dygraph_utils
+from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
 import paddle
 
 
@@ -205,27 +206,28 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.array_equal(dy_grad1, dy_grad2))
 
     def test_functional_paddle_imperative_dygraph_context(self):
-        self.assertFalse(paddle.imperative.enabled())
-        paddle.enable_imperative()
-        self.assertTrue(paddle.imperative.enabled())
+        self.assertFalse(paddle.in_dynamic_mode())
+        paddle.disable_static()
+        self.assertTrue(paddle.in_dynamic_mode())
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        var_inp = paddle.imperative.to_variable(np_inp)
+        var_inp = paddle.to_variable(np_inp)
         mlp = MLP(input_size=2)
         out = mlp(var_inp)
         dy_out1 = out.numpy()
         out.backward()
         dy_grad1 = mlp._linear1.weight.gradient()
-        paddle.disable_imperative()
-        self.assertFalse(paddle.imperative.enabled())
-        with paddle.imperative.guard():
-            self.assertTrue(paddle.imperative.enabled())
-            var_inp = paddle.imperative.to_variable(np_inp)
-            mlp = MLP(input_size=2)
-            out = mlp(var_inp)
-            dy_out2 = out.numpy()
-            out.backward()
-            dy_grad2 = mlp._linear1.weight.gradient()
-        self.assertFalse(paddle.imperative.enabled())
+        paddle.enable_static()
+        self.assertFalse(paddle.in_dynamic_mode())
+        paddle.disable_static()
+        self.assertTrue(paddle.in_dynamic_mode())
+        var_inp = paddle.to_variable(np_inp)
+        mlp = MLP(input_size=2)
+        out = mlp(var_inp)
+        dy_out2 = out.numpy()
+        out.backward()
+        dy_grad2 = mlp._linear1.weight.gradient()
+        paddle.enable_static()
+        self.assertFalse(paddle.in_dynamic_mode())
         self.assertTrue(np.array_equal(dy_out1, dy_out2))
         self.assertTrue(np.array_equal(dy_grad1, dy_grad2))
 
@@ -281,7 +283,7 @@ class TestImperative(unittest.TestCase):
             l0 = fluid.Linear(2, 2)
             self.assertTrue(l0.weight._grad_ivar() is None)
             l1 = fluid.Linear(2, 2)
-            with paddle.imperative.no_grad():
+            with paddle.no_grad():
                 self.assertTrue(l1.weight.stop_gradient is False)
                 tmp = l1.weight * 2
                 self.assertTrue(tmp.stop_gradient)
@@ -312,9 +314,8 @@ class TestImperative(unittest.TestCase):
                 inputs2.append(tmp)
             ret2 = fluid.layers.sums(inputs2)
             loss2 = fluid.layers.reduce_sum(ret2)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            loss2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            loss2.backward()
 
             self.assertTrue(np.allclose(ret.numpy(), x * 10))
             self.assertTrue(np.allclose(inputs[0].gradient(), x))
@@ -401,9 +402,8 @@ class TestImperative(unittest.TestCase):
             x2 = l2(var_inp2)[0]
             self.assertIsNotNone(x2)
             dy_out2 = x2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            x2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            x2.backward()
             dy_grad2 = l2._x_for_debug.gradient()
 
         with new_program_scope():
@@ -440,9 +440,8 @@ class TestImperative(unittest.TestCase):
             mlp2 = MLP(input_size=2)
             out2 = mlp2(var_inp2)
             dy_out2 = out2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            out2.backward()
             dy_grad2 = mlp2._linear1.weight.gradient()
 
         with new_program_scope():
@@ -550,9 +549,8 @@ class TestImperative(unittest.TestCase):
             simple_rnn2 = SimpleRNN()
             outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
             dy_out2 = outs2[3].numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            outs2[3].backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            outs2[3].backward()
             dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
             dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
             dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
@@ -628,6 +626,16 @@ class TestDygraphUtils(unittest.TestCase):
             res2 = fluid.layers.sigmoid(a)
             self.assertTrue(np.allclose(res1.numpy(), res2.numpy()))
 
+    def test_append_activation_in_dygraph3(self):
+        a_np = np.random.random(size=(10, 20, 30)).astype(np.float32)
+        helper = LayerObjectHelper(fluid.unique_name.generate("test"))
+        func = helper.append_activation
+        with fluid.dygraph.guard():
+            a = fluid.dygraph.to_variable(a_np)
+            res1 = func(a, act="sigmoid", use_cudnn=True)
+            res2 = fluid.layers.sigmoid(a)
+            self.assertTrue(np.array_equal(res1.numpy(), res2.numpy()))
+
     def test_append_bias_in_dygraph_exception(self):
         with new_program_scope():
             np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
index 71b208e2cdd114ba527746d085cb066204c23777..4c9061dd83414219e7b251aafda1a21f92da2b7d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
+from paddle.fluid.reader import use_pinned_memory
 
 
 def get_random_images_and_labels(image_shape, label_shape):
@@ -77,6 +78,18 @@ class TestDygraphDataLoader(unittest.TestCase):
                 batch_size=self.batch_size)
             self.iter_loader_data(loader)
 
+    def test_set_pin_memory(self):
+        with fluid.dygraph.guard():
+            use_pinned_memory(False)
+            loader = fluid.io.DataLoader.from_generator(
+                capacity=self.capacity, iterable=False, use_multiprocess=False)
+            loader.set_sample_generator(
+                sample_generator_creator(self.batch_size, self.batch_num),
+                batch_size=self.batch_size,
+                places=fluid.CPUPlace())
+            self.iter_loader_data(loader)
+            use_pinned_memory(True)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
index a61950f2dc0775fcbad5fd970ee95ed5ebf1c558..d3f488d92ac455072b37274e2ce782bcf41e8cc7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
@@ -43,7 +43,7 @@ class MLP(fluid.Layer):
 class TestDataParallelStateDict(unittest.TestCase):
     def test_data_parallel_state_dict(self):
         with fluid.dygraph.guard():
-            strategy = paddle.imperative.prepare_context()
+            strategy = paddle.prepare_context()
             mlp = MLP()
             parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
index 82e81d72f9a9823817355087d332c3d7fb1ffe5a..820206a3ce630eb92a36a154ca7cdec62de2ce34 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import unittest
@@ -27,7 +28,7 @@ class TestTracerMode(unittest.TestCase):
     def get_tracer_mode(self):
         assert fluid.in_dygraph_mode(), "Dygraph mode must be enabled"
 
-    @fluid.dygraph.no_grad
+    @paddle.no_grad()
     def no_grad_func(self, a):
         self.assertEqual(self.tracer._train_mode, False)
         return a
@@ -55,13 +56,32 @@ class TestTracerMode(unittest.TestCase):
             def need_no_grad_func(a, b=1):
                 return a + b
 
-            decorated_func = fluid.dygraph.no_grad(need_no_grad_func)
+            decorated_func = paddle.no_grad()(need_no_grad_func)
             self.assertTrue(
                 str(inspect.getargspec(decorated_func)) ==
                 str(inspect.getargspec(need_no_grad_func)))
 
             self.assertEqual(self.tracer._train_mode, self.init_mode)
 
+            def test_gen():
+                for i in range(3):
+                    yield i
+
+            a = 0
+            for i in test_gen():
+                a += i
+
+            @paddle.no_grad()
+            def test_wrapped_gen():
+                for i in range(3):
+                    yield i
+
+            b = 0
+            for i in test_wrapped_gen():
+                b += i
+
+            self.assertEqual(a, b)
+
         with fluid.dygraph.guard():
             self.check_not_support_rlt(False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index f76c3bd958081070939a85c390eeaeaa389ad5a4..af71d9d27b9a349e2b0e08c03dd04e3936d34afb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -275,8 +275,7 @@ class TestDygraphDeepCF(unittest.TestCase):
             deepcf2 = DeepCF(num_users, num_items, matrix)
             adam2 = fluid.optimizer.AdamOptimizer(
                 0.01, parameter_list=deepcf2.parameters())
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             for e in range(NUM_EPOCHES):
                 sys.stderr.write('epoch %d\n' % e)
                 for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
@@ -289,7 +288,7 @@ class TestDygraphDeepCF(unittest.TestCase):
                         fluid.layers.log_loss(prediction2,
                                               to_variable(labels_np[
                                                   slice:slice + BATCH_SIZE])))
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     adam2.minimize(loss2)
                     deepcf2.clear_gradients()
                     dy_loss2 = loss2.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index 5c94f1836bf7354464bf9c21129cb14bdfaee160..227cd5d4acb290baeb622a84d729b01bc45d48b1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -52,8 +52,7 @@ class TestDygraphDoubleGrad(TestCase):
              retain_graph=None,
              create_graph=False,
              allow_unused=False):
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = self.sort_sum_gradient
+        fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
         return fluid.dygraph.grad(
             outputs=outputs,
             inputs=inputs,
@@ -61,8 +60,7 @@ class TestDygraphDoubleGrad(TestCase):
             no_grad_vars=no_grad_vars,
             retain_graph=retain_graph,
             create_graph=create_graph,
-            allow_unused=allow_unused,
-            backward_strategy=backward_strategy)
+            allow_unused=allow_unused)
 
     @dygraph_guard
     def test_exception(self):
@@ -298,19 +296,20 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
 
 class TestDygraphDoubleGradVisitedUniq(TestCase):
     def test_compare(self):
-        value = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
+                                                          5).astype("float32")
 
         def model_f(input):
-            conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+            linear = fluid.dygraph.Linear(5, 3, bias_attr=False)
             for i in range(10):
                 if i == 0:
-                    out = conv2d(input)
+                    out = linear(input)
                 else:
-                    out = out + conv2d(input)
+                    out = out + linear(input)
             return out
 
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = True
+        fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+
         with fluid.dygraph.guard():
             paddle.manual_seed(123)
             a = fluid.dygraph.to_variable(value)
@@ -321,11 +320,9 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
             dx = fluid.dygraph.grad(
                 outputs=[out],
                 inputs=[a],
-                create_graph=True,
-                retain_graph=True,
+                create_graph=False,
                 only_inputs=True,
-                allow_unused=False,
-                backward_strategy=backward_strategy)
+                allow_unused=False)
 
             grad_1 = dx[0].numpy()
 
@@ -335,13 +332,29 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
             a.stop_gradient = False
 
             out = model_f(a)
-            out.backward(backward_strategy)
+            out.backward()
 
             grad_2 = a.gradient()
 
-        self.assertTrue(
-            np.allclose(
-                grad_1, grad_2, rtol=1.e-5, atol=1.e-8, equal_nan=True))
+        self.assertTrue(np.array_equal(grad_1, grad_2))
+
+
+class TestRaiseNoDoubleGradOp(TestCase):
+    def raise_no_grad_op(self):
+        with fluid.dygraph.guard():
+            x = fluid.layers.ones(shape=[2, 3, 2, 2], dtype='float32')
+            x.stop_gradient = False
+            y = paddle.fluid.layers.batch_norm(x)
+
+            dx = fluid.dygraph.grad(
+                outputs=[y], inputs=[x], create_graph=True,
+                retain_graph=True)[0]
+
+            loss = fluid.layers.reduce_mean(dx)
+            loss.backward()
+
+    def test_raise(self):
+        self.assertRaises(fluid.core.EnforceNotMet, self.raise_no_grad_op)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index b7ebd23a0b74208e768ea4e67b69dc4a596c6764..80bdf2ea8a898716fa20be315ac57371191b1a61 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -179,9 +179,8 @@ class TestDygraphGAN(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
             discriminator2 = Discriminator()
             generator2 = Generator()
             sgd2 = SGDOptimizer(
@@ -201,7 +200,7 @@ class TestDygraphGAN(unittest.TestCase):
                     x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
 
             d_loss2 = d_loss_real2 + d_loss_fake2
-            d_loss2.backward(backward_strategy)
+            d_loss2.backward()
             sgd2.minimize(d_loss2)
             discriminator2.clear_gradients()
             generator2.clear_gradients()
@@ -211,7 +210,7 @@ class TestDygraphGAN(unittest.TestCase):
             g_loss2 = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss2.backward(backward_strategy)
+            g_loss2.backward()
             sgd2.minimize(g_loss2)
             for p in discriminator2.parameters():
                 dy_params2[p.name] = p.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
index 4fe4d963ca5ee4cff1e7073d11361de69e68aa9f..317353684317f6fa0e8cf37cda58f2041e70befd 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
@@ -62,8 +62,7 @@ class Test_Forward_Hook(unittest.TestCase):
             with fluid.dygraph.guard(place):
                 fluid.default_startup_program().random_seed = seed
                 fluid.default_main_program().random_seed = seed
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
+                fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
                 input_word = np.array(
                     [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7,
@@ -132,8 +131,7 @@ class Test_Forward_Hook(unittest.TestCase):
             with fluid.dygraph.guard(place):
                 fluid.default_startup_program().random_seed = seed
                 fluid.default_main_program().random_seed = seed
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
+                fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
                 global call_forward_hook
                 global call_forward_pre_hook
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
index a391c088a3640c097ff0f4ff714bf50470c575c6..f61d1ab888a51b2ebe4d1205b30fb84dfa4e7aeb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
@@ -28,11 +28,11 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = nn.Sequential(
-            nn.Conv2D(
+            nn.Conv2d(
                 1, 6, 3, stride=1, padding=1),
             nn.ReLU(),
             nn.Pool2D(2, 'max', 2),
-            nn.Conv2D(
+            nn.Conv2d(
                 6, 16, 5, stride=1, padding=0),
             nn.ReLU(),
             nn.Pool2D(2, 'max', 2))
@@ -40,9 +40,8 @@ class LeNetDygraph(fluid.dygraph.Layer):
         if num_classes > 0:
             self.fc = nn.Sequential(
                 nn.Linear(400, 120),
-                nn.Linear(120, 84),
-                nn.Linear(
-                    84, 10, act=classifier_activation))
+                nn.Linear(120, 84), nn.Linear(84, 10),
+                nn.Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -61,7 +60,7 @@ def init_weights(layer):
         new_bias = paddle.fill_constant(
             layer.bias.shape, layer.bias.dtype, value=-0.1)
         layer.bias.set_value(new_bias)
-    elif type(layer) == nn.Conv2D:
+    elif type(layer) == nn.Conv2d:
         new_weight = paddle.fill_constant(
             layer.weight.shape, layer.weight.dtype, value=0.7)
         layer.weight.set_value(new_weight)
@@ -81,7 +80,7 @@ class TestLayerApply(unittest.TestCase):
                 if type(layer) == nn.Linear:
                     np.testing.assert_allclose(layer.weight.numpy(), 0.9)
                     np.testing.assert_allclose(layer.bias.numpy(), -0.1)
-                elif type(layer) == nn.Conv2D:
+                elif type(layer) == nn.Conv2d:
                     np.testing.assert_allclose(layer.weight.numpy(), 0.7)
                     np.testing.assert_allclose(layer.bias.numpy(), -0.2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e0902341a59649219cf94ef9741fdf7ae09233
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+
+import numpy as np
+
+
+class LeNetDygraph(fluid.dygraph.Layer):
+    def __init__(self):
+        super(LeNetDygraph, self).__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.Pool2D(2, 'max', 2),
+            nn.Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            nn.ReLU(),
+            nn.Pool2D(2, 'max', 2))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        return x
+
+
+class TestLayerChildren(unittest.TestCase):
+    def test_apply_init_weight(self):
+        with fluid.dygraph.guard():
+            net = LeNetDygraph()
+            net.eval()
+
+            net_layers = nn.Sequential(*list(net.children()))
+            net_layers.eval()
+
+            x = paddle.rand([2, 1, 28, 28])
+
+            y1 = net(x)
+            y2 = net_layers(x)
+
+            np.testing.assert_allclose(y1.numpy(), y2.numpy())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
index 69fd7d80327f1a666870dc76e041449366565b01..6349d71760934c9da3aed4896ea651c45af657ad 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
@@ -113,8 +113,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_loss = None
 
                     helper = DyGraphProgramDescTracerTestHelper(self)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+                    })
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -129,7 +130,7 @@ class TestDygraphSimpleNet(unittest.TestCase):
                         if i == 0:
                             for param in simple_net.parameters():
                                 dy_param_init[param.name] = param.numpy()
-                        dy_loss.backward(backward_strategy)
+                        dy_loss.backward()
                         sgd.minimize(dy_loss)
                         sgd.clear_gradients()
                         if i == batch_num - 1:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index a3c602646b700556cea53a9b06295e38baf705dd..1e509960c076339d2d56ccfcdd7a795fa462ca82 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -153,7 +153,7 @@ class TestImperativeMnist(unittest.TestCase):
                     label.stop_gradient = True
 
                     if batch_id % 10 == 0:
-                        cost, traced_layer = paddle.imperative.TracedLayer.trace(
+                        cost, traced_layer = paddle.jit.TracedLayer.trace(
                             mnist, inputs=img)
                         if program is not None:
                             self.assertTrue(program, traced_layer.program)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
index 4ce0ca350ddb9e8b9873a1650eefa1d5b2db4938..bda1958c0f3544bef51e51cf418ae6c07bdd7056 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
@@ -36,8 +36,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             mnist2 = MNIST()
             sgd2 = SGDOptimizer(
@@ -69,7 +68,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
                         for param in mnist2.parameters():
                             dy_param_init_value2[param.name] = param.numpy()
 
-                    avg_loss2.backward(backward_strategy)
+                    avg_loss2.backward()
                     sgd2.minimize(avg_loss2)
                     mnist2.clear_gradients()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
index 246b013f1ada6bc853711e146379b8bb2df5e363..499a4311f6e1714b239259d68217370edea20a2f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -403,8 +403,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             ocr_attention = OCRAttention()
 
             if Config.learning_rate_decay == "piecewise_decay":
@@ -438,7 +437,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
                         for param in ocr_attention.parameters():
                             if param.name not in dy_param_init_value:
                                 dy_param_init_value[param.name] = param.numpy()
-                    avg_loss.backward(backward_strategy)
+                    avg_loss.backward()
                     dy_grad_value = {}
                     for param in ocr_attention.parameters():
                         if param.trainable:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec331e2e5b3b8ab541d4075852f5ecfe0300e28e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
@@ -0,0 +1,720 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import itertools
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
+from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
+from paddle.fluid.dygraph import Linear
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+# Note(wangzhongpu)
+# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer.
+
+
+class MLP(fluid.Layer):
+    def __init__(self, param_attr=None, bias_attr=None):
+        super(MLP, self).__init__()
+
+        self._fc1 = Linear(784, 10)
+        self._fc2 = Linear(10, 10)
+
+    def forward(self, inputs):
+        y = self._fc1(inputs)
+        y = self._fc2(y)
+        return y
+
+
+class TestImperativeOptimizerBase(unittest.TestCase):
+    def setUp(self):
+        self.batch_num = 20
+
+    def get_optimizer_dygraph(self, parameter_list):
+        raise NotImplementedError()
+
+    def get_optimizer(self):
+        raise NotImplementedError()
+
+    def reader_decorator(self, reader):
+        def _reader_imple():
+            for item in reader():
+                image = np.array(item[0]).reshape(1, 784)
+                label = np.array(item[1]).astype('int64').reshape(1)
+                yield image, label
+
+        return _reader_imple
+
+    def _check_exception(self, exception_message, place=None):
+        seed = 90
+        batch_size = 128
+        if place == None:
+            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            try:
+                fluid.default_startup_program().random_seed = seed
+                fluid.default_main_program().random_seed = seed
+                mlp = MLP()
+                optimizer = self.get_optimizer_dygraph(
+                    parameter_list=mlp.parameters())
+            except Exception as e:
+                assert str(e) == exception_message
+
+    def _check_mlp(self, place=None):
+        seed = 90
+        batch_size = 128
+
+        if place == None:
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+
+        with fluid.dygraph.guard(place):
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mlp = MLP()
+            optimizer = self.get_optimizer_dygraph(
+                parameter_list=mlp.parameters())
+
+            batch_py_reader = fluid.io.PyReader(capacity=1)
+            batch_py_reader.decorate_sample_list_generator(
+                paddle.batch(
+                    self.reader_decorator(paddle.dataset.mnist.train()),
+                    batch_size=batch_size,
+                    drop_last=True),
+                places=fluid.CPUPlace())
+
+            dy_param_init_value = {}
+            for batch_id, data in enumerate(batch_py_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                img = data[0]
+                label = data[1]
+                label.stop_gradient = True
+
+                img = fluid.layers.reshape(img, shape=[batch_size, -1])
+                cost = mlp(img)
+                avg_loss = fluid.layers.reduce_mean(cost)
+                dy_out = avg_loss.numpy()
+
+                if batch_id == 0:
+                    for param in mlp.parameters():
+                        dy_param_init_value[param.name] = param.numpy()
+
+                avg_loss.backward()
+                optimizer.minimize(avg_loss)
+                mlp.clear_gradients()
+                dy_param_value = {}
+                for param in mlp.parameters():
+                    dy_param_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            if place == None:
+                place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+                ) else fluid.CUDAPlace(0)
+
+            exe = fluid.Executor(place)
+
+            mlp = MLP()
+            optimizer = self.get_optimizer()
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            img = fluid.layers.reshape(img, shape=[batch_size, 784])
+            cost = mlp(img)
+            avg_loss = fluid.layers.reduce_mean(cost)
+            optimizer.minimize(avg_loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in mlp.parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                static_x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    [128, 1])
+
+                fetch_list = [avg_loss.name]
+                fetch_list.extend(static_param_name_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={"pixel": static_x_data,
+                                    "label": y_data},
+                              fetch_list=fetch_list)
+
+                static_param_value = {}
+                static_out = out[0]
+                for i in range(1, len(out)):
+                    static_param_value[static_param_name_list[i - 1]] = out[i]
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
+
+
+class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        bd = [3, 6, 9]
+        optimizer = SGDOptimizer(
+            learning_rate=paddle.optimizer.PiecewiseLR(
+                boundaries=bd,
+                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        bd = [3, 6, 9]
+        optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR(
+            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.natural_exp_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.exponential_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = Adam(
+            learning_rate=fluid.layers.inverse_time_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_adam(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.polynomial_decay(
+                learning_rate=0.1, decay_steps=5, cycle=self.cycle),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
+            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
+        return optimizer
+
+    def test_sgd_cycle(self):
+        self.cycle = True
+        self._check_mlp()
+
+    def test_sgd(self):
+        self.cycle = False
+        self._check_mlp()
+
+
+class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.cosine_decay(
+                learning_rate=0.1, step_each_epoch=10000, epochs=120),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
+            learning_rate=0.1, step_each_epoch=10000, epochs=120))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.noam_decay(
+                d_model=512, warmup_steps=8000),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
+            d_model=512, warmup_steps=8000))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestOptimizerLearningRate(unittest.TestCase):
+    def test_constant_lr(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 0.001, rtol=1e-06, atol=0.0))
+
+            for i in range(10):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+
+                self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0))
+
+    def test_lr_decay(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            bd = [2, 4, 6, 8]
+            value = [0.2, 0.4, 0.6, 0.8, 1.0]
+
+            scheduler = paddle.optimizer.PiecewiseLR(bd, value)
+            adam = paddle.optimizer.Adam(
+                scheduler, parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 0.2, rtol=1e-06, atol=0.0))
+
+            ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
+            for i in range(12):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+                self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+                scheduler.step()
+
+    def test_lr_decay_natural_exp(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+            base_lr = 1.0
+
+            scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5)
+            print("scheduler.last_lr", scheduler.last_lr)
+            adam = paddle.optimizer.Adam(
+                scheduler, parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
+
+            ret = [1.0, np.exp(-0.5), np.exp(-1)]
+            for i in range(3):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+                self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+                scheduler.step()
+
+    def test_set_lr(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+            for i in range(5):
+                adam.set_lr(lr_list[i])
+                adam.minimize(loss)
+                lr = adam.get_lr()
+                self.assertTrue(
+                    np.allclose(
+                        lr, lr_list[i], rtol=1e-06, atol=0.0))
+
+            with self.assertRaises(TypeError):
+                lr_var = fluid.layers.create_global_var(
+                    shape=[1], value=0.7, dtype='float32')
+                adam.set_lr(lr_var)
+
+            with self.assertRaises(RuntimeError):
+                adam = paddle.optimizer.Adam(
+                    paddle.optimizer.NaturalExpLR(
+                        learning_rate=0.1, gamma=0.5),
+                    parameters=linear.parameters())
+                adam.set_lr(0.01)
+
+
+class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        return optimizer
+
+    def test_momentum(self):
+        self._check_mlp()
+
+
+class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = LarsMomentumOptimizer(
+            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        return optimizer
+
+    def test_larsmomentum(self):
+        self._check_mlp()
+
+
+class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdagradOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdagradOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_adagrad(self):
+        self._check_mlp()
+
+
+class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdamaxOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdamaxOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_adamax(self):
+        self._check_mlp()
+
+
+class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DpsgdOptimizer(
+            learning_rate=0.01,
+            clip=10.0,
+            batch_size=16.0,
+            sigma=1.0,
+            parameter_list=parameter_list)
+        optimizer._seed = 100
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = DpsgdOptimizer(
+            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
+        optimizer._seed = 100
+        return optimizer
+
+    def test_dpsgd(self):
+        self._check_mlp(place=fluid.CPUPlace())
+
+
+class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DecayedAdagradOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = DecayedAdagradOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_decayadagrad(self):
+        self._check_mlp()
+
+
+class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdadeltaOptimizer(
+            learning_rate=0.0003,
+            epsilon=1.0e-6,
+            rho=0.95,
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdadeltaOptimizer(
+            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+        return optimizer
+
+    def test_adadelta(self):
+        self._check_mlp()
+
+
+class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = RMSPropOptimizer(
+            learning_rate=0.1, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = RMSPropOptimizer(learning_rate=0.1)
+        return optimizer
+
+    def test_rmsprop(self):
+        self._check_mlp()
+
+
+class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = FtrlOptimizer(
+            learning_rate=0.1, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = FtrlOptimizer(learning_rate=0.1)
+        return optimizer
+
+    def test_ftrl(self):
+        self._check_mlp()
+
+
+def exclude_fn(param):
+    return param.name.endswith('.b_0')
+
+
+class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = LambOptimizer(
+            learning_rate=0.002,
+            exclude_from_weight_decay_fn=exclude_fn,
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = LambOptimizer(
+            learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn)
+        return optimizer
+
+    def test_lamb(self):
+        self._check_mlp()
+
+
+class TestImperativeModelAverage(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = ModelAverage(
+            0.15, min_average_window=10000, max_average_window=12500)
+        return optimizer
+
+    def test_modelaverage(self):
+        exception_message = "In dygraph, don't support ModelAverage."
+        self._check_exception(exception_message)
+
+
+class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DGCMomentumOptimizer(
+            learning_rate=0.0001,
+            momentum=0.9,
+            rampup_step=1000,
+            rampup_begin_step=1252,
+            sparsity=[0.999, 0.999])
+        return optimizer
+
+    def test_dgcmomentum(self):
+        exception_message = "In dygraph, don't support DGCMomentumOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = ExponentialMovingAverage(0.999)
+        return optimizer
+
+    def test_exponentialmoving(self):
+        exception_message = "In dygraph, don't support ExponentialMovingAverage."
+        self._check_exception(exception_message)
+
+
+class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = PipelineOptimizer(optimizer)
+        return optimizer
+
+    def test_pipline(self):
+        exception_message = "In dygraph, don't support PipelineOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
+        return optimizer
+
+    def test_lookahead(self):
+        exception_message = "In dygraph, don't support LookaheadOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = RecomputeOptimizer(optimizer)
+        return optimizer
+
+    def test_recompute(self):
+        exception_message = "In dygraph, don't support RecomputeOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeOptimizerList(unittest.TestCase):
+    def test_parameter_list(self):
+        with fluid.dygraph.guard():
+            linear_1 = Linear(10, 10)
+            linear_2 = Linear(10, 10)
+
+            sgd = SGDOptimizer(
+                1.0,
+                parameter_list=itertools.chain(linear_1.parameters(),
+                                               linear_2.parameters()))
+
+            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            in_data = fluid.dygraph.to_variable(in_np)
+
+            y = linear_1(in_data)
+            y = linear_2(y)
+            loss = fluid.layers.reduce_mean(y)
+            loss.backward()
+            sgd.minimize(loss)
+
+            self.assertTrue(
+                len(sgd._parameter_list) ==
+                len(linear_1.parameters() + linear_2.parameters()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
index 8e85fe5dfefea3221fe0566ac506b1277263eec2..526c1706e2d08bdf779846a6f30706435eb4a503 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
@@ -45,8 +45,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -82,7 +81,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
-                dy_loss.backward(backward_strategy)
+                dy_loss.backward()
                 sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 106f58ccc99ffe42b77466e6dbf7b773ecee4ee2..815437072fde291b8d8348dba0b4b0ae872ec1b9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -83,7 +83,8 @@ class ConvBNLayer(fluid.Layer):
                  filter_size,
                  stride=1,
                  groups=1,
-                 act=None):
+                 act=None,
+                 use_cudnn=False):
         super(ConvBNLayer, self).__init__()
 
         self._conv = Conv2D(
@@ -94,8 +95,8 @@ class ConvBNLayer(fluid.Layer):
             padding=(filter_size - 1) // 2,
             groups=groups,
             act=None,
-            bias_attr=None,
-            use_cudnn=False)
+            bias_attr=False,
+            use_cudnn=use_cudnn)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -107,32 +108,41 @@ class ConvBNLayer(fluid.Layer):
 
 
 class BottleneckBlock(fluid.Layer):
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 use_cudnn=False):
         super(BottleneckBlock, self).__init__()
 
         self.conv0 = ConvBNLayer(
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
-            act='relu')
+            act='relu',
+            use_cudnn=use_cudnn)
         self.conv1 = ConvBNLayer(
             num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
-            act='relu')
+            act='relu',
+            use_cudnn=use_cudnn)
         self.conv2 = ConvBNLayer(
             num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
-            act=None)
+            act=None,
+            use_cudnn=use_cudnn)
 
         if not shortcut:
             self.short = ConvBNLayer(
                 num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
-                stride=stride)
+                stride=stride,
+                use_cudnn=use_cudnn)
 
         self.shortcut = shortcut
 
@@ -153,7 +163,7 @@ class BottleneckBlock(fluid.Layer):
 
 
 class ResNet(fluid.Layer):
-    def __init__(self, layers=50, class_dim=102):
+    def __init__(self, layers=50, class_dim=102, use_cudnn=False):
         super(ResNet, self).__init__()
 
         self.layers = layers
@@ -171,7 +181,12 @@ class ResNet(fluid.Layer):
         num_filters = [64, 128, 256, 512]
 
         self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu',
+            use_cudnn=use_cudnn)
         self.pool2d_max = Pool2D(
             pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
 
@@ -186,7 +201,8 @@ class ResNet(fluid.Layer):
                         if i == 0 else num_filters[block] * 4,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut))
+                        shortcut=shortcut,
+                        use_cudnn=use_cudnn))
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
index 8cbd08ea3e245f70a6a4aceb3f6c9e0b83356981..d26d6f25aa8ffbbde3af9148bebba156eeef5e38 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
@@ -79,8 +79,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             resnet = ResNet()
             optimizer = optimizer_setting(
                 train_parameters, parameter_list=resnet.parameters())
@@ -119,7 +118,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
                         if param.name not in dy_param_init_value:
                             dy_param_init_value[param.name] = param.numpy()
 
-                avg_loss.backward(backward_strategy)
+                avg_loss.backward()
 
                 dy_grad_value = {}
                 for param in resnet.parameters():
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index 927e51b56d727f92b75930eb0915fb5da8931f01..a2f75089102ebbcdd2753c8fdad6653b511919bd 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -292,7 +292,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 np_t = v.numpy()
                 self.model_base[k] = np_t
 
-            paddle.imperative.save(self.state_dict, "./test_dy")
+            paddle.save(self.state_dict, "./test_dy")
 
     def testLoadAndSetVarBase(self):
         seed = 90
@@ -373,8 +373,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
-                "./test_dy")
+            para_state_dict, opti_state_dict = paddle.load("./test_dy")
+            print(opti_state_dict['LR_Scheduler'])
             adam.set_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -900,18 +900,17 @@ class TestDygraphPtbRnn(unittest.TestCase):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.imperative.save(state_dict,
-                                   os.path.join('saved_dy', 'emb_dy'))
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy'))
 
             self.assertTrue(opti_state_dict == None)
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy.pdopt'))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1e7fc2b1d3faeceaa90ac03c12819108be2bc38
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
@@ -0,0 +1,922 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.dygraph.nn import Embedding, Linear
+import paddle.fluid.framework as framework
+from paddle.optimizer import Adam
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+from test_imperative_base import new_program_scope
+import numpy as np
+import six
+import paddle
+
+
+class SimpleLSTMRNN(fluid.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_steps,
+                 num_layers=2,
+                 init_scale=0.1,
+                 dropout=None):
+        super(SimpleLSTMRNN, self).__init__()
+        self._hidden_size = hidden_size
+        self._num_layers = num_layers
+        self._init_scale = init_scale
+        self._dropout = dropout
+        self._input = None
+        self._num_steps = num_steps
+        self.cell_array = []
+        self.hidden_array = []
+        self.weight_1_arr = []
+        self.weight_2_arr = []
+        self.bias_arr = []
+        self.mask_array = []
+
+        for i in range(self._num_layers):
+            weight_1 = self.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 2, self._hidden_size * 4],
+                dtype="float32",
+                default_initializer=fluid.initializer.UniformInitializer(
+                    low=-self._init_scale, high=self._init_scale))
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
+            bias_1 = self.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 4],
+                dtype="float32",
+                default_initializer=fluid.initializer.Constant(0.0))
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
+
+    def forward(self, input_embedding, init_hidden=None, init_cell=None):
+        self.cell_array = []
+        self.hidden_array = []
+
+        for i in range(self._num_layers):
+            pre_hidden = fluid.layers.slice(
+                init_hidden, axes=[0], starts=[i], ends=[i + 1])
+            pre_cell = fluid.layers.slice(
+                init_cell, axes=[0], starts=[i], ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(
+                pre_hidden, shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(
+                pre_cell, shape=[-1, self._hidden_size])
+            self.hidden_array.append(pre_hidden)
+            self.cell_array.append(pre_cell)
+
+        res = []
+        for index in range(self._num_steps):
+            self._input = fluid.layers.slice(
+                input_embedding, axes=[1], starts=[index], ends=[index + 1])
+            self._input = fluid.layers.reshape(
+                self._input, shape=[-1, self._hidden_size])
+            for k in range(self._num_layers):
+                pre_hidden = self.hidden_array[k]
+                pre_cell = self.cell_array[k]
+                weight_1 = self.weight_1_arr[k]
+                bias = self.bias_arr[k]
+
+                nn = fluid.layers.concat([self._input, pre_hidden], 1)
+                gate_input = fluid.layers.matmul(x=nn, y=weight_1)
+
+                gate_input = fluid.layers.elementwise_add(gate_input, bias)
+                i, j, f, o = fluid.layers.split(
+                    gate_input, num_or_sections=4, dim=-1)
+                c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
+                    i) * fluid.layers.tanh(j)
+                m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
+                self.hidden_array[k] = m
+                self.cell_array[k] = c
+                self._input = m
+
+                if self._dropout is not None and self._dropout > 0.0:
+                    self._input = fluid.layers.dropout(
+                        self._input,
+                        dropout_prob=self._dropout,
+                        dropout_implementation='upscale_in_train')
+            res.append(
+                fluid.layers.reshape(
+                    self._input, shape=[1, -1, self._hidden_size]))
+        real_res = fluid.layers.concat(res, 0)
+        real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
+        last_hidden = fluid.layers.concat(self.hidden_array, 1)
+        last_hidden = fluid.layers.reshape(
+            last_hidden, shape=[-1, self._num_layers, self._hidden_size])
+        last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
+        last_cell = fluid.layers.concat(self.cell_array, 1)
+        last_cell = fluid.layers.reshape(
+            last_cell, shape=[-1, self._num_layers, self._hidden_size])
+        last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
+        return real_res, last_hidden, last_cell
+
+
+class PtbModel(fluid.Layer):
+    def __init__(self,
+                 hidden_size,
+                 vocab_size,
+                 num_layers=2,
+                 num_steps=20,
+                 init_scale=0.1,
+                 dropout=None):
+        super(PtbModel, self).__init__()
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.init_scale = init_scale
+        self.num_layers = num_layers
+        self.num_steps = num_steps
+        self.dropout = dropout
+        self.simple_lstm_rnn = SimpleLSTMRNN(
+            hidden_size,
+            num_steps,
+            num_layers=num_layers,
+            init_scale=init_scale,
+            dropout=dropout)
+        self.embedding = Embedding(
+            size=[vocab_size, hidden_size],
+            dtype='float32',
+            is_sparse=False,
+            param_attr=fluid.ParamAttr(
+                name='embedding_para',
+                initializer=fluid.initializer.UniformInitializer(
+                    low=-init_scale, high=init_scale)))
+
+        self.softmax_weight = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.hidden_size, self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
+        self.softmax_bias = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
+
+    def forward(self, input, label, init_hidden, init_cell):
+        init_h = fluid.layers.reshape(
+            init_hidden, shape=[self.num_layers, -1, self.hidden_size])
+
+        init_c = fluid.layers.reshape(
+            init_cell, shape=[self.num_layers, -1, self.hidden_size])
+
+        x_emb = self.embedding(input)
+        x_emb = fluid.layers.reshape(
+            x_emb, shape=[-1, self.num_steps, self.hidden_size])
+        if self.dropout is not None and self.dropout > 0.0:
+            x_emb = fluid.layers.dropout(
+                x_emb,
+                dropout_prob=self.drop_out,
+                dropout_implementation='upscale_in_train')
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
+                                                               init_c)
+        rnn_out = fluid.layers.reshape(
+            rnn_out, shape=[-1, self.num_steps, self.hidden_size])
+
+        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
+        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
+        projection = fluid.layers.reshape(
+            projection, shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(
+            logits=projection, label=label, soft_label=False)
+        loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
+        loss = fluid.layers.reduce_mean(loss, dim=[0])
+        loss = fluid.layers.reduce_sum(loss)
+
+        return loss, last_hidden, last_cell
+
+
+class TestDygraphPtbRnn(unittest.TestCase):
+    def setUp(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
+            adam = Adam(
+                learning_rate=scheduler, parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                scheduler.step()
+                ptb_model.clear_gradients()
+
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            self.opti_dict = adam.state_dict()
+            self.base_opti = {}
+            for k, v in self.opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.base_opti[v.name] = v.numpy()
+                    self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
+                else:
+                    self.base_opti[k] = v
+
+            fluid.save_dygraph(self.opti_dict, "./test_dy_v2")
+
+            self.state_dict = ptb_model.state_dict()
+
+            self.model_base = {}
+            for k, v in self.state_dict.items():
+                np_t = v.numpy()
+                self.model_base[k] = np_t
+
+            paddle.save(self.state_dict, "./test_dy_v2")
+
+    def testLoadAndSetVarBase(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
+            adam = Adam(
+                learning_rate=scheduler, parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                scheduler.step()
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+
+            para_state_dict, opti_state_dict = paddle.load("./test_dy_v2")
+            adam.set_state_dict(opti_state_dict)
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(para_state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetVariable(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
+            adam = Adam(
+                learning_rate=scheduler, parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                scheduler.step()
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+
+            if isinstance(adam._learning_rate, LearningRateDecay):
+                adam._learning_rate.step_num = 0
+
+            adam.set_state_dict(self.opti_dict)
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(self.state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetNumpy(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
+            adam = Adam(
+                learning_rate=scheduler, parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                scheduler.step()
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            np_opti_dict = {}
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    np_opti_dict[v.name] = np_t
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                else:
+                    np_opti_dict[k] = v
+
+            if isinstance(adam._learning_rate, LearningRateDecay):
+                adam._learning_rate.step_num = 0
+
+            adam.set_state_dict(np_opti_dict)
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            np_state_dict = {}
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                np_state_dict[k] = np_t
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(np_state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetVariableBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=0.0,
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            adam.set_state_dict(self.opti_dict)
+            ptb_model.set_dict(self.state_dict)
+
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "global_step":
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testLoadAndSetVarBaseBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [0.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                # set lr to zero not update parameter
+                new_lr = 0.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=0.0,
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            state_dict, opti_dict = fluid.load_dygraph("./test_dy_v2")
+            adam.set_state_dict(opti_dict)
+            ptb_model.set_dict(state_dict)
+
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "global_step":
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            # check parameter
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetNumpyBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [0.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                # set lr to 0.0, not update parameter
+                new_lr = 0.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
+            adam = Adam(
+                learning_rate=scheduler,
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            np_opti_dict = {}
+            np_state_dict = {}
+
+            for k, v in self.opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_opti_dict[v.name] = v.numpy()
+                else:
+                    np_opti_dict[k] = v
+
+            for k, v in self.state_dict.items():
+                np_state_dict[k] = v.numpy()
+
+            adam.set_state_dict(np_opti_dict)
+            ptb_model.set_dict(np_state_dict)
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                scheduler.step()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "LR_Scheduler":
+                    self.assertTrue(
+                        np.array_equal(v['last_epoch'], self.base_opti[k][
+                            'last_epoch'] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            # check parameter
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testOnlyLoadParams(self):
+        with fluid.dygraph.guard():
+            emb = fluid.dygraph.Embedding([10, 10])
+            state_dict = emb.state_dict()
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy'))
+
+            self.assertTrue(opti_state_dict == None)
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdparams'))
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdopt'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index 2789174ba7a5805b86557a9a465c661a906bc0a7..59ddb365e539603c1eba06ca8828fc244b6e542d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -47,33 +47,35 @@ class TestSimpleNet(unittest.TestCase):
         for place in places:
             for dtype in ["float32", "float64"]:
                 for sort_sum_gradient in [True, False]:
-                    with paddle.imperative.guard(place):
-                        backward_strategy = paddle.imperative.BackwardStrategy()
-                        backward_strategy.sort_sum_gradient = sort_sum_gradient
-                        # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                    paddle.disable_static(place)
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': sort_sum_gradient
+                    })
+                    # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
-                        input_word = np.array([[1, 2], [2, 1]]).astype('int64')
-                        input = paddle.imperative.to_variable(input_word)
+                    input_word = np.array([[1, 2], [2, 1]]).astype('int64')
+                    input = paddle.to_variable(input_word)
 
-                        simplenet = SimpleNet(20, 32, dtype)
-                        adam = SGDOptimizer(
-                            learning_rate=0.001,
-                            parameter_list=simplenet.parameters(
-                            ))  # grad_clip=grad_clip
-                        input_emb, emb = simplenet(input)
+                    simplenet = SimpleNet(20, 32, dtype)
+                    adam = SGDOptimizer(
+                        learning_rate=0.001,
+                        parameter_list=simplenet.parameters(
+                        ))  # grad_clip=grad_clip
+                    input_emb, emb = simplenet(input)
 
-                        self.assertTrue(emb.weight.gradient() is None)
-                        self.assertTrue(input_emb.gradient() is None)
+                    self.assertTrue(emb.weight.gradient() is None)
+                    self.assertTrue(input_emb.gradient() is None)
 
-                        input_emb.backward(backward_strategy)
-                        adam.minimize(input_emb)
-                        self.assertTrue(emb.weight.gradient() is not None)
+                    input_emb.backward()
+                    adam.minimize(input_emb)
+                    self.assertTrue(emb.weight.gradient() is not None)
 
-                        emb.clear_gradients()
-                        self.assertTrue(emb.weight.gradient() is None)
+                    emb.clear_gradients()
+                    self.assertTrue(emb.weight.gradient() is None)
 
-                        input_emb.clear_gradient()
-                        self.assertTrue(input_emb.gradient() is not None)
+                    input_emb.clear_gradient()
+                    self.assertTrue(input_emb.gradient() is not None)
+                    paddle.enable_static()
 
     def test_selectedrows_gradient2(self):
         places = [fluid.CPUPlace()]
@@ -83,8 +85,9 @@ class TestSimpleNet(unittest.TestCase):
         for place in places:
             for sort_sum_gradient in [True, False]:
                 with fluid.dygraph.guard(place):
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': sort_sum_gradient
+                    })
                     grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
@@ -100,7 +103,7 @@ class TestSimpleNet(unittest.TestCase):
                     self.assertTrue(emb.weight.gradient() is None)
                     self.assertTrue(input_emb.gradient() is None)
 
-                    input_emb.backward(backward_strategy)
+                    input_emb.backward()
                     adam.minimize(input_emb)
                     self.assertTrue(emb.weight.gradient() is not None)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
index a42a62019ba54a771d26ad853e39fcf8ca991180..3765cb784d6522cd0249a77045f8cbc841a2d9ac 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
@@ -119,8 +119,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_param_init = dict()
                     dy_loss = None
 
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+                    })
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -135,7 +136,7 @@ class TestDygraphSimpleNet(unittest.TestCase):
                         if i == 0:
                             for param in simple_net.parameters():
                                 dy_param_init[param.name] = param.numpy()
-                        dy_loss.backward(backward_strategy)
+                        dy_loss.backward()
                         sgd.minimize(dy_loss)
                         sgd.clear_gradients()
                         if i == batch_num - 1:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
index 649dc1ad91d3878dacc551fd08527885c3f479aa..d603a7d6ca0dea8df2e60207211f2061f1fe616d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
@@ -479,8 +479,7 @@ class DyGraphTrainModel(object):
 
         self.cfg = cfg
 
-        self.backward_strategy = fluid.dygraph.BackwardStrategy()
-        self.backward_strategy.sort_sum_gradient = cfg.sort_sum_gradient
+        fluid.set_flags({'FLAGS_sort_sum_gradient': cfg.sort_sum_gradient})
 
     def clear_gradients(self):
         if self.g_optimizer:
@@ -497,7 +496,7 @@ class DyGraphTrainModel(object):
         g_loss = get_generator_loss(image_real, label_org, label_trg,
                                     self.generator, self.discriminator,
                                     self.cfg)
-        g_loss.backward(self.backward_strategy)
+        g_loss.backward()
         if self.g_optimizer:
             self.g_optimizer.minimize(g_loss)
 
@@ -506,7 +505,7 @@ class DyGraphTrainModel(object):
         d_loss = get_discriminator_loss(image_real, label_org, label_trg,
                                         self.generator, self.discriminator,
                                         self.cfg)
-        d_loss.backward(self.backward_strategy)
+        d_loss.backward()
         if self.d_optimizer:
             self.d_optimizer.minimize(d_loss)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
index afdab0148cbf078db6e183c7e6105fdb27d3266b..f10d2df7f06f98334df62d3021403d686054b7d9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
@@ -25,6 +25,8 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 from test_imperative_base import new_program_scope
 
+LOADED_VAR_SUFFIX = ".load_0"
+
 
 def convolutional_neural_network(img):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
@@ -119,8 +121,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             mnist = fluid.dygraph.static_runner.StaticModelRunner(
                 model_dir=self.save_dirname,
@@ -154,7 +155,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)
 
-                    avg_loss.backward(backward_strategy)
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
                     mnist.clear_gradients()
 
@@ -307,14 +308,14 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         self.assertTrue(np.array_equal(static_x_data, dy_x_data))
 
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         # np.testing.assert_array_almost_equal(static_out, dy_out)
         self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-4))
 
     def test_mnist_train_with_params_filename(self):
@@ -335,14 +336,14 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         self.assertTrue(np.array_equal(static_x_data, dy_x_data))
 
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         # np.testing.assert_array_almost_equal(static_out, dy_out)
         self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-4))
 
     def test_mnist_infer_no_params_filename(self):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
index f501593d09d85a357113e075514c7d7cfbecaa2a..db47170c7bfff4575a9b4dcf694cd8ed722b0b8f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
@@ -27,6 +27,8 @@ from test_imperative_base import new_program_scope
 
 import paddle.fluid.transpiler.details.program_utils as pu
 
+LOADED_VAR_SUFFIX = ".load_0"
+
 
 def while_softmax_regression(img):
     def cond(i, times, pred):
@@ -109,9 +111,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
             np.random.seed(self.seed)
-
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             while_net = fluid.dygraph.static_runner.StaticModelRunner(
                 self.save_dirname)
@@ -139,7 +139,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
                 loss = fluid.layers.cross_entropy(cost, label)
                 avg_loss = fluid.layers.mean(loss)
 
-                avg_loss.backward(backward_strategy)
+                avg_loss.backward()
                 sgd.minimize(avg_loss)
                 while_net.clear_gradients()
 
@@ -219,13 +219,13 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
 
         # Phase 3. compare
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         self.assertTrue(np.allclose(static_out, dy_out))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
index 29cc718f14ff98de2b668d313d380d784cbaa6ef..c59ce44ec96a87383ec12998767af70ac07ff743 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
@@ -951,8 +951,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
         with guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             transformer = TransFormer(
                 ModelHyperParams.src_vocab_size,
                 ModelHyperParams.trg_vocab_size,
@@ -1021,7 +1020,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
                     for param in transformer.parameters():
                         dy_param_init[param.name] = param.numpy()
 
-                dy_avg_cost.backward(backward_strategy)
+                dy_avg_cost.backward()
                 optimizer.minimize(dy_avg_cost)
                 transformer.clear_gradients()
 
diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..e329a37488a2cb8234532cd0a9beb7a1a25e72a6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_input_spec.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from paddle.static import InputSpec
+from paddle.fluid.framework import core, convert_np_dtype_to_dtype_
+
+
+class TestInputSpec(unittest.TestCase):
+    def test_default(self):
+        tensor_spec = InputSpec([3, 4])
+        self.assertEqual(tensor_spec.dtype,
+                         convert_np_dtype_to_dtype_('float32'))
+        self.assertEqual(tensor_spec.name, None)
+
+    def test_from_tensor(self):
+        x_bool = fluid.layers.fill_constant(shape=[1], dtype='bool', value=True)
+        bool_spec = InputSpec.from_tensor(x_bool)
+        self.assertEqual(bool_spec.dtype, x_bool.dtype)
+        self.assertEqual(bool_spec.shape, x_bool.shape)
+        self.assertEqual(bool_spec.name, x_bool.name)
+
+        bool_spec2 = InputSpec.from_tensor(x_bool, name='bool_spec')
+        self.assertEqual(bool_spec2.name, bool_spec2.name)
+
+    def test_from_numpy(self):
+        x_numpy = np.ones([10, 12])
+        x_np_spec = InputSpec.from_numpy(x_numpy)
+        self.assertEqual(x_np_spec.dtype,
+                         convert_np_dtype_to_dtype_(x_numpy.dtype))
+        self.assertEqual(x_np_spec.shape, x_numpy.shape)
+        self.assertEqual(x_np_spec.name, None)
+
+        x_numpy2 = np.array([1, 2, 3, 4]).astype('int64')
+        x_np_spec2 = InputSpec.from_numpy(x_numpy2, name='x_np_int64')
+        self.assertEqual(x_np_spec2.dtype,
+                         convert_np_dtype_to_dtype_(x_numpy2.dtype))
+        self.assertEqual(x_np_spec2.shape, x_numpy2.shape)
+        self.assertEqual(x_np_spec2.name, 'x_np_int64')
+
+    def test_shape_with_none(self):
+        tensor_spec = InputSpec([None, 4, None], dtype='int8', name='x_spec')
+        self.assertEqual(tensor_spec.dtype, convert_np_dtype_to_dtype_('int8'))
+        self.assertEqual(tensor_spec.name, 'x_spec')
+        self.assertEqual(tensor_spec.shape, (-1, 4, -1))
+
+    def test_shape_raise_error(self):
+        # 1. shape should only contain int and None.
+        with self.assertRaises(ValueError):
+            tensor_spec = InputSpec(['None', 4, None], dtype='int8')
+
+        # 2. shape should be type `list` or `tuple`
+        with self.assertRaises(TypeError):
+            tensor_spec = InputSpec(4, dtype='int8')
+
+        # 3. len(shape) should be greater than 0.
+        with self.assertRaises(ValueError):
+            tensor_spec = InputSpec([], dtype='int8')
+
+    def test_batch_and_unbatch(self):
+        tensor_spec = InputSpec([10])
+        # insert batch_size
+        batch_tensor_spec = tensor_spec.batch(16)
+        self.assertEqual(batch_tensor_spec.shape, (16, 10))
+
+        # unbatch
+        unbatch_spec = batch_tensor_spec.unbatch()
+        self.assertEqual(unbatch_spec.shape, (10, ))
+
+        # 1. `unbatch` requires len(shape) > 1
+        with self.assertRaises(ValueError):
+            unbatch_spec.unbatch()
+
+        # 2. `batch` requires len(batch_size) == 1
+        with self.assertRaises(ValueError):
+            tensor_spec.batch([16, 12])
+
+        # 3. `batch` requires type(batch_size) == int
+        with self.assertRaises(TypeError):
+            tensor_spec.batch('16')
+
+    def test_eq_and_hash(self):
+        tensor_spec_1 = InputSpec([10, 16], dtype='float32')
+        tensor_spec_2 = InputSpec([10, 16], dtype='float32')
+        tensor_spec_3 = InputSpec([10, 16], dtype='float32', name='x')
+        tensor_spec_4 = InputSpec([16], dtype='float32', name='x')
+
+        # override ``__eq__`` according to [shape, dtype, name]
+        self.assertTrue(tensor_spec_1 == tensor_spec_2)
+        self.assertTrue(tensor_spec_1 != tensor_spec_3)  # different name
+        self.assertTrue(tensor_spec_3 != tensor_spec_4)  # different shape
+
+        # override ``__hash__``  according to [shape, dtype]
+        self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_2))
+        self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_3))
+        self.assertTrue(hash(tensor_spec_3) != hash(tensor_spec_4))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestInstanceNorm(unittest.TestCase):
+    def test_error(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+
+            def error1d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                instance_norm1d = paddle.nn.InstanceNorm1d(1)
+                instance_norm1d(fluid.dygraph.to_variable(x_data_4))
+
+            def error2d():
+                x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+                instance_norm2d = paddle.nn.InstanceNorm2d(1)
+                instance_norm2d(fluid.dygraph.to_variable(x_data_3))
+
+            def error3d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                instance_norm3d = paddle.nn.BatchNorm3d(1)
+                instance_norm3d(fluid.dygraph.to_variable(x_data_4))
+
+            with fluid.dygraph.guard(p):
+                self.assertRaises(ValueError, error1d)
+                self.assertRaises(ValueError, error2d)
+                self.assertRaises(ValueError, error3d)
+
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.InstanceNorm(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    bn = paddle.nn.InstanceNorm2d(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    ins = fluid.dygraph.InstanceNorm(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    ins = paddle.nn.InstanceNorm2d(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inverse_op.py b/python/paddle/fluid/tests/unittests/test_inverse_op.py
index 5349654ac27800d2e70c4b77f6531853178fd3ed..fd540dcd741eef4c007eae19a982bc186c09d7d7 100644
--- a/python/paddle/fluid/tests/unittests/test_inverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_inverse_op.py
@@ -89,8 +89,7 @@ class TestInverseAPI(unittest.TestCase):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[4, 4], dtype="float64")
-            result = paddle.inverse(input=input)
-
+            result = paddle.inverse(x=input)
             input_np = np.random.random([4, 4]).astype("float64")
             result_np = np.linalg.inv(input_np)
 
@@ -145,7 +144,7 @@ class TestInverseSingularAPI(unittest.TestCase):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[4, 4], dtype="float64")
-            result = paddle.inverse(input=input)
+            result = paddle.inverse(x=input)
 
             input_np = np.zeros([4, 4]).astype("float64")
 
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a868e751f0567e6387b0e9471f0382c9456bcb6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+
+
+def run_static(x_np, dtype, op_str, use_gpu=False):
+    paddle.enable_static()
+    startup_program = fluid.Program()
+    main_program = fluid.Program()
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    with fluid.program_guard(main_program, startup_program):
+        x = paddle.data(name='x', shape=x_np.shape, dtype=dtype)
+        res = getattr(paddle.tensor, op_str)(x)
+        exe.run(startup_program)
+        static_result = exe.run(main_program,
+                                feed={'x': x_np},
+                                fetch_list=[res])
+    return static_result
+
+
+def run_dygraph(x_np, op_str, use_gpu=True):
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    paddle.disable_static(place)
+    x = paddle.to_variable(x_np)
+    dygraph_result = getattr(paddle.tensor, op_str)(x)
+    return dygraph_result
+
+
+def np_data_generator(low, high, np_shape, type, sv_list, op_str, *args,
+                      **kwargs):
+    x_np = np.random.uniform(low, high, np_shape).astype(getattr(np, type))
+    # x_np.shape[0] >= len(sv_list)
+    if type in ['float16', 'float32', 'float64']:
+        for i, v in enumerate(sv_list):
+            x_np[i] = v
+    ori_shape = x_np.shape
+    x_np = x_np.reshape((np.product(ori_shape), ))
+    np.random.shuffle(x_np)
+    x_np = x_np.reshape(ori_shape)
+    result_np = getattr(np, op_str)(x_np)
+    return x_np, result_np
+
+
+TEST_META_DATA = [
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [8, 17, 5, 6, 7],
+        'type': 'float16',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [11, 17],
+        'type': 'float32',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [2, 3, 4, 5],
+        'type': 'float64',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0,
+        'high': 100,
+        'np_shape': [11, 17, 10],
+        'type': 'int32',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0,
+        'high': 999,
+        'np_shape': [132],
+        'type': 'int64',
+        'sv_list': [np.inf, np.nan]
+    },
+]
+
+
+def test(test_case, op_str, use_gpu=False):
+    for meta_data in TEST_META_DATA:
+        meta_data = dict(meta_data)
+        meta_data['op_str'] = op_str
+        x_np, result_np = np_data_generator(**meta_data)
+        static_result = run_static(x_np, meta_data['type'], op_str, use_gpu)
+        dygraph_result = run_dygraph(x_np, op_str, use_gpu)
+        test_case.assertTrue((static_result == result_np).all())
+        test_case.assertTrue((dygraph_result.numpy() == result_np).all())
+
+
+class TestCPUNormal(unittest.TestCase):
+    def test_inf(self):
+        test(self, 'isinf')
+
+    def test_nan(self):
+        test(self, 'isnan')
+
+    def test_finite(self):
+        test(self, 'isfinite')
+
+
+class TestCUDANormal(unittest.TestCase):
+    def test_inf(self):
+        test(self, 'isinf', True)
+
+    def test_nan(self):
+        test(self, 'isnan', True)
+
+    def test_finite(self):
+        test(self, 'isfinite', True)
+
+
+class TestError(unittest.TestCase):
+    def test_bad_input(self):
+        paddle.enable_static()
+        with fluid.program_guard(fluid.Program()):
+
+            def test_isinf_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isinf(x)
+
+            self.assertRaises(TypeError, test_isinf_bad_x)
+
+            def test_isnan_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isnan(x)
+
+            self.assertRaises(TypeError, test_isnan_bad_x)
+
+            def test_isfinite_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isfinite(x)
+
+            self.assertRaises(TypeError, test_isfinite_bad_x)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index a61d31e88253d7b45efde6226fe14cf5b5b11af9..2b79659b9c6957aa6b141f8b36cb674497cb1392 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -15,32 +15,33 @@
 from __future__ import print_function
 
 import os
+import pickle
 import unittest
 import numpy as np
 
-import paddle
+from paddle.static import InputSpec
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME
 
 BATCH_SIZE = 32
-BATCH_NUM = 20
+BATCH_NUM = 10
 SEED = 10
 
 
-def random_batch_reader():
-    def _get_random_images_and_labels(image_shape, label_shape):
+def random_batch_reader(input_size, label_size):
+    def _get_random_inputs_and_labels(input_size, label_size):
         np.random.seed(SEED)
-        image = np.random.random(size=image_shape).astype('float32')
-        label = np.random.random(size=label_shape).astype('int64')
-        return image, label
+        input = np.random.random(size=input_size).astype('float32')
+        label = np.random.random(size=label_size).astype('int64')
+        return input, label
 
     def __reader__():
         for _ in range(BATCH_NUM):
-            batch_image, batch_label = _get_random_images_and_labels(
-                [BATCH_SIZE, 784], [BATCH_SIZE, 1])
-            yield batch_image, batch_label
+            batch_input, batch_label = _get_random_inputs_and_labels(
+                [BATCH_SIZE, input_size], [BATCH_SIZE, label_size])
+            yield batch_input, batch_label
 
     return __reader__
 
@@ -77,13 +78,14 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
         return z, loss
 
 
-def train(layer):
+def train(layer, input_size=784, label_size=1):
     # create optimizer
     adam = fluid.optimizer.SGDOptimizer(
         learning_rate=0.01, parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-    train_loader.set_batch_generator(random_batch_reader())
+    train_loader.set_batch_generator(
+        random_batch_reader(input_size, label_size))
     # train
     for data in train_loader():
         img, label = data
@@ -100,11 +102,6 @@ def train(layer):
     return [img], layer, avg_loss
 
 
-def infer(layer):
-    x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
-    return layer(x)
-
-
 class TestJitSaveLoad(unittest.TestCase):
     def setUp(self):
         self.model_path = "model.test_jit_save_load"
@@ -159,7 +156,7 @@ class TestJitSaveLoad(unittest.TestCase):
 
     def load_dygraph_state_dict(self, train_layer):
         train_layer.eval()
-        # contruct new model
+        # construct new model
         new_layer = LinearNet(784, 1)
         model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
         new_layer.set_dict(model_dict)
@@ -179,7 +176,7 @@ class TestJitSaveLoad(unittest.TestCase):
                 model_path=self.model_path,
                 input_spec=example_inputs)
 
-    def test_load_dygraoh_no_path(self):
+    def test_load_dygraph_no_path(self):
         model_path = "model.test_jit_save_load.no_path"
         new_layer = LinearNet(784, 1)
         with self.assertRaises(ValueError):
@@ -205,6 +202,92 @@ class TestJitSaveLoad(unittest.TestCase):
             model_dict, _ = fluid.dygraph.load_dygraph(model_path)
 
 
+class LinearNetMultiInput(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetMultiInput, self).__init__()
+        self._linear1 = Linear(in_size, out_size)
+        # self._linear2 = Linear(in_size, out_size)
+
+    @declarative(input_spec=[
+        InputSpec(
+            [None, 8], dtype='float32'), InputSpec(
+                [None, 8], dtype='float32')
+    ])
+    def forward(self, x, y):
+        x_out = self._linear1(x)
+        y_out = self._linear1(y)
+        loss = fluid.layers.mean(x_out + y_out)
+        return x_out, y_out, loss
+
+
+class TestSaveLoadWithInputSpec(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        fluid.enable_dygraph()
+
+    def test_with_input_spec(self):
+        net = LinearNetReturnLoss(8, 8)
+        # set x.shape = [None, 8]
+        net.forward = declarative(
+            net.forward, input_spec=[InputSpec(
+                [None, 8], name='x')])
+
+        model_path = "model.input_spec.output_spec"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        # check inputs and outputs
+        self.assertTrue(len(net.forward.inputs) == 1)
+        input_x = net.forward.inputs[0]
+        self.assertTrue(input_x.shape == (-1, 8))
+        self.assertTrue(input_x.name == 'x')
+
+        # 1. prune loss
+        configs.output_spec = net.forward.outputs[:1]
+        fluid.dygraph.jit.save(net, model_path, configs=configs)
+
+        # 2. load to infer
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        pred = infer_layer(x)
+
+    def test_multi_in_out(self):
+        net = LinearNetMultiInput(8, 8)
+
+        model_path = "model.multi_inout.output_spec1"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        # 1. check inputs and outputs
+        self.assertTrue(len(net.forward.inputs) == 2)
+        input_x = net.forward.inputs[0]
+        input_y = net.forward.inputs[1]
+        self.assertTrue(input_x.shape == (-1, 8))
+        self.assertTrue(input_y.shape == (-1, 8))
+
+        # 2. prune loss
+        configs.output_spec = net.forward.outputs[:2]
+        fluid.dygraph.jit.save(net, model_path, configs=configs)
+
+        # 3. load to infer
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        y = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        # 4. predict
+        pred_x, pred_y = infer_layer(x, y)
+
+        # 1. prune y and loss
+        model_path = "model.multi_inout.output_spec2"
+        configs.output_spec = net.forward.outputs[:1]
+        fluid.dygraph.jit.save(net, model_path, [input_x], configs)
+        # 2. load again
+        infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs)
+        # 3. predict
+        pred_xx = infer_layer2(x)
+
+        # 4. assert pred_x == pred_xx
+        self.assertTrue(np.allclose(pred_x.numpy(), pred_xx.numpy()))
+
+
 class TestJitSaveLoadConfig(unittest.TestCase):
     def setUp(self):
         # enable dygraph mode
@@ -279,5 +362,119 @@ class TestJitSaveLoadConfig(unittest.TestCase):
             np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
 
 
+class MultiLoadingLinearNet(fluid.dygraph.Layer):
+    def __init__(self, size, model_path):
+        super(MultiLoadingLinearNet, self).__init__()
+        self._linear = Linear(size, size)
+        self._load_linear1 = fluid.dygraph.jit.load(model_path)
+        self._load_linear2 = fluid.dygraph.jit.load(model_path)
+
+    @declarative
+    def forward(self, x):
+        tmp1 = self._linear(x)
+        tmp2 = self._load_linear1(tmp1)
+        tmp3 = self._load_linear2(tmp2)
+        y = self._linear(tmp3)
+        return y
+
+
+class TestJitMultipleLoading(unittest.TestCase):
+    def setUp(self):
+        self.linear_size = 4
+        self.model_path = "model.jit_multi_load"
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+        # train and save base model
+        self.train_and_save_orig_model()
+
+    def train_and_save_orig_model(self):
+        layer = LinearNet(self.linear_size, self.linear_size)
+        example_inputs, layer, _ = train(layer, self.linear_size, 1)
+        fluid.dygraph.jit.save(
+            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+
+    def test_load_model_retransform_inference(self):
+        multi_loaded_layer = MultiLoadingLinearNet(self.linear_size,
+                                                   self.model_path)
+        state_dict = multi_loaded_layer.state_dict()
+        name_set = set()
+        for _, var in state_dict.items():
+            self.assertTrue(var.name not in name_set)
+            name_set.add(var.name)
+
+
+class LinearNetReturnHidden(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetReturnHidden, self).__init__()
+        self._linear_1 = Linear(in_size, out_size)
+        self._linear_2 = Linear(in_size, out_size)
+
+    @declarative
+    def forward(self, x):
+        y = self._linear_1(x)
+        z = self._linear_2(y)
+        loss = fluid.layers.mean(z)
+        return y, loss
+
+
+class TestJitPruneModelAndLoad(unittest.TestCase):
+    def setUp(self):
+        self.linear_size = 4
+        self.model_path = "model.jit_prune_model_and_load"
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+
+    def train_and_save(self):
+        train_layer = LinearNetReturnHidden(8, 8)
+        adam = fluid.optimizer.AdamOptimizer(
+            learning_rate=0.1, parameter_list=train_layer.parameters())
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        for i in range(10):
+            hidden, loss = train_layer(x)
+            loss.backward()
+            adam.minimize(loss)
+            train_layer.clear_gradients()
+
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.output_spec = [hidden]
+        fluid.dygraph.jit.save(
+            layer=train_layer,
+            model_path=self.model_path,
+            input_spec=[x],
+            configs=configs)
+
+        return train_layer
+
+    def test_load_pruned_model(self):
+        train_layer = self.train_and_save()
+        train_layer.eval()
+
+        infer_layer = fluid.dygraph.jit.load(self.model_path)
+
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
+
+    def test_load_var_not_in_extra_var_info(self):
+        self.train_and_save()
+
+        # chage extra var info
+        var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME)
+        with open(var_info_path, 'rb') as f:
+            extra_var_info = pickle.load(f)
+            extra_var_info.clear()
+        with open(var_info_path, 'wb') as f:
+            pickle.dump(extra_var_info, f, protocol=2)
+
+        with self.assertRaises(RuntimeError):
+            fluid.dygraph.jit.load(self.model_path)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index a19b4d9c13a9e646da405babfbac98f7ed15f217..8780727e4cb276a989a8d04d05c6419a4874e7f5 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -13,6 +13,7 @@
 
 from __future__ import division
 
+import paddle
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -77,5 +78,36 @@ class TestKLDivLossOp4(TestKLDivLossOp):
         self.reduction = 'sum'
 
 
+class TestKLDivLossDygraph(unittest.TestCase):
+    def run_kl_loss(self, reduction, shape=(5, 20)):
+        x = np.random.uniform(-10, 10, shape).astype('float64')
+        target = np.random.uniform(-10, 10, shape).astype('float64')
+        gt_loss = kldiv_loss(x, target, reduction)
+
+        with paddle.fluid.dygraph.guard():
+            kldiv_criterion = paddle.nn.KLDivLoss(reduction)
+            pred_loss = kldiv_criterion(
+                paddle.to_variable(x), paddle.to_variable(target))
+            self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
+
+    def test_kl_loss_batchmean(self):
+        self.run_kl_loss('batchmean')
+
+    def test_kl_loss_mean(self):
+        self.run_kl_loss('mean')
+
+    def test_kl_loss_sum(self):
+        self.run_kl_loss('sum')
+
+    def test_kl_loss_none(self):
+        self.run_kl_loss('none')
+
+    def test_kl_loss_static_api(self):
+        input = paddle.fluid.data(name='input', shape=[5, 20])
+        label = paddle.fluid.data(name='label', shape=[5, 20])
+
+        pred_loss = paddle.nn.functional.kl_div(input, label)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py
index d7e801a666f110471180d704a6fda6dc0f9aeb1e..6a15fe494779f93c2f36a301594aaccf55283902 100644
--- a/python/paddle/fluid/tests/unittests/test_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py
@@ -20,111 +20,165 @@ import numpy as np
 import unittest
 
 
-class TestL1Loss(unittest.TestCase):
-    def test_L1Loss_mean(self):
-        input_np = np.random.random(size=(10, 1)).astype(np.float32)
-        label_np = np.random.random(size=(10, 1)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
-                name='input', shape=[10, 1], dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[10, 1], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss()
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss()
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.mean(np.abs(input_np - label_np))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+class TestFunctionalL1Loss(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+        self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_variable(self.input_np)
+        label = paddle.to_variable(self.label_np)
+        dy_result = paddle.nn.functional.l1_loss(input, label)
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-    def test_L1Loss_sum(self):
-        input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
-        label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
+        dy_result = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [1])
+
+        dy_result = paddle.nn.functional.l1_loss(input, label, reduction='none')
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [10, 10, 5])
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32')
+        result0 = paddle.nn.functional.l1_loss(input, label)
+        result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+        result2 = paddle.nn.functional.l1_loss(input, label, reduction='none')
+        y = paddle.nn.functional.l1_loss(input, label, name='aaa')
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "label": self.label_np},
+            fetch_list=[result0, result1, result2])
+
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[0], expected))
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[1], expected))
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(static_result[2], expected))
+
+        self.assertTrue('aaa' in y.name)
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+    # test case the raise message
+    def test_errors(self):
+        def test_value_error():
+            input = paddle.data(
                 name='input', shape=[10, 10, 5], dtype='float32')
-            label = fluid.layers.data(
+            label = paddle.data(
                 name='label', shape=[10, 10, 5], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.sum(np.abs(input_np - label_np))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+            loss = paddle.nn.functional.l1_loss(
+                input, label, reduction='reduce_mean')
+
+        self.assertRaises(ValueError, test_value_error)
+
+
+class TestClassL1Loss(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+        self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_variable(self.input_np)
+        label = paddle.to_variable(self.label_np)
+        l1_loss = paddle.nn.loss.L1Loss()
+        dy_result = l1_loss(input, label)
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [1])
+
+        l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+        dy_result = l1_loss(input, label)
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-    def test_L1Loss_none(self):
-        input_np = np.random.random(size=(10, 5)).astype(np.float32)
-        label_np = np.random.random(size=(10, 5)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
-                name='input', shape=[10, 5], dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[10, 5], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.abs(input_np - label_np)
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
-        self.assertTrue(dy_result.shape, input.shape)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+        dy_result = l1_loss(input, label)
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [10, 10, 5])
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32')
+        l1_loss = paddle.nn.loss.L1Loss()
+        result0 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+        result1 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+        result2 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(name='aaa')
+        result3 = l1_loss(input, label)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "label": self.label_np},
+            fetch_list=[result0, result1, result2])
+
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[0], expected))
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[1], expected))
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(static_result[2], expected))
+        self.assertTrue('aaa' in result3.name)
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+    # test case the raise message
+    def test_errors(self):
+        def test_value_error():
+            loss = paddle.nn.loss.L1Loss(reduction="reduce_mean")
+
+        self.assertRaises(ValueError, test_value_error)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f324e4bd377c616fb14b2b6df2b936b04ed76ff5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestDygraphLayerNormv2(unittest.TestCase):
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    ln = fluid.dygraph.LayerNorm(shape[1:])
+                    y = ln(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    ln = paddle.nn.LayerNorm(shape[1:])
+                    y = ln(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    ln = fluid.dygraph.LayerNorm(shape[1:])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ln(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    ln = paddle.nn.LayerNorm(shape[1:])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ln(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 9da70e85f01c0a13a87766a1befbda206c510cbe..1992a3bb39807a62966e245d24888cc074746e8d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -283,6 +283,24 @@ class TestLayer(LayerTest):
             with self.assertRaises(ValueError):
                 lm(base.to_variable(inp))
 
+    def test_SyncBatchNorm(self):
+        if core.is_compiled_with_cuda():
+            with self.static_graph():
+                t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
+                my_sync_bn = paddle.nn.SyncBatchNorm(3)
+                ret = my_sync_bn(t)
+                static_ret = self.get_static_graph_result(
+                    feed={'t': np.ones(
+                        [3, 3, 5, 5], dtype='float32')},
+                    fetch_list=[ret])[0]
+
+            with self.dynamic_graph():
+                t = np.ones([3, 3, 5, 5], dtype='float32')
+                my_syncbn = paddle.nn.SyncBatchNorm(3)
+                dy_ret = my_syncbn(base.to_variable(t))
+                dy_ret_value = dy_ret.numpy()
+            self.assertTrue(np.array_equal(static_ret, static_ret))
+
     def test_relu(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
@@ -298,21 +316,6 @@ class TestLayer(LayerTest):
 
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
 
-    def test_leakyrelu(self):
-        inputs = np.random.uniform(-1, 1, (10, 10)).astype('float32')
-        with self.static_graph():
-            t = layers.data(name='t', shape=[10, 10], dtype='float32')
-            ret = layers.leaky_relu(t, alpha=0.01)
-            static_ret = self.get_static_graph_result(
-                feed={'t': inputs}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            lrelu = paddle.nn.LeakyReLU(alpha=0.01)
-            dy_ret = lrelu(base.to_variable(inputs))
-            dy_ret_value = dy_ret.numpy()
-
-        self.assertTrue(np.allclose(static_ret, dy_ret_value))
-
     def test_pad2d(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
@@ -2660,13 +2663,6 @@ class TestBook(LayerTest):
             out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu')
             return (out)
 
-    def make_leaky_relu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu')
-            return (out)
-
     def make_soft_relu(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
@@ -3686,5 +3682,32 @@ class TestBook(LayerTest):
                         batch_first=batch_first)
 
 
+class TestMetricsDetectionMap(unittest.TestCase):
+    def test_detection_map(self):
+        program = fluid.Program()
+        with program_guard(program):
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 1],
+                append_batch_size=False,
+                dtype='float32')
+            box = fluid.layers.data(
+                name='bbox',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            map_eval = fluid.metrics.DetectionMAP(
+                detect_res, label, box, class_num=21)
+            cur_map, accm_map = map_eval.get_map_var()
+            self.assertIsNotNone(cur_map)
+            self.assertIsNotNone(accm_map)
+        print(str(program))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 71b452d4a2dd192c756599eb24949084bfa0860e..36368a83893c7eea3e5842638b3fc677e1a1b936 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -19,6 +19,7 @@ import math
 import numpy as np
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
@@ -522,111 +523,5 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
         run_places(lr, start_lr, end_lr)
 
 
-def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
-                         var_list):
-    def is_better(current, best, m, n):
-        if m == 'min' and n == 'rel':
-            return current < best - best * threshold
-        elif m == 'min' and n == 'abs':
-            return current < best - threshold
-        elif m == 'max' and n == 'rel':
-            return current > best + best * threshold
-        else:  # mode == 'max' and epsilon_mode == 'abs':
-            return current > best + threshold
-
-    if var_list[2] > 0:
-        var_list[2] -= 1
-        return var_list[1]
-
-    if is_better(loss, var_list[0], m, n):
-        var_list[0] = loss
-        var_list[3] = 0
-    else:
-        var_list[3] += 1
-        if var_list[3] > patience:
-            var_list[2] = cooldown
-            var_list[3] = 0
-            new_lr = var_list[1] * decay_rate
-            var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1]
-
-    return var_list[1]
-
-
-class TestReduceLROnPlateauDecay(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            # the decay rate must be less than 1.0
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(
-                    learning_rate=1.0, decay_rate=2.0)
-            # the mode must be "min" or "max"
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(learning_rate=1.0, mode="test")
-            # the threshold_mode must be "rel" or "abs"
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(
-                    learning_rate=1.0, threshold_mode="test")
-
-            base_lr = 1.0
-            patience = 3
-            cooldown = 1
-            decay_rate = 0.5
-            threshold = 1e-4
-            linear = fluid.dygraph.Linear(10, 10)
-
-            for m, n in zip(['min', 'max', 'min', 'max'],
-                            ['rel', 'rel', 'abs', 'abs']):
-                kwargs = {
-                    'learning_rate': base_lr,
-                    'decay_rate': decay_rate,
-                    'threshold': threshold,
-                    'verbose': True,
-                    'patience': patience,
-                    'cooldown': cooldown,
-                    'mode': m,
-                    'threshold_mode': n,
-                    'eps': 1e-6
-                }
-                print("class=" + fluid.dygraph.ReduceLROnPlateau.__name__ +
-                      " kwargs=" + str(kwargs))
-                lr = fluid.dygraph.ReduceLROnPlateau(**kwargs)
-                sgd = fluid.optimizer.SGD(learning_rate=lr,
-                                          parameter_list=linear.parameters())
-
-                best = float("-10000") if m == "max" else float("10000")
-                expected_lr = 1.0
-                cooldown_counter = 0
-                num_bad_epochs = 0
-                var_list = [best, expected_lr, cooldown_counter, num_bad_epochs]
-                step_num = 0
-                epoch_num = 0
-                for epoch in range(30):
-                    total_loss = 0
-
-                    for batch_id in range(2):
-                        step_num += 1
-                        x = fluid.dygraph.to_variable(
-                            np.array([step_num]).astype('float32'))
-                        loss = layers.sin(x)
-                        sgd.minimize(loss)
-                        total_loss += loss
-
-                    epoch_num += 1
-                    # get expected lr from fluid
-                    avg_loss = total_loss / 1
-                    lr.step(avg_loss)
-                    actual_lr = lr().numpy()[0]
-
-                    # get expected lr form python
-                    expected_lr = reduce_lr_on_plateau(decay_rate, threshold,
-                                                       cooldown, patience, m, n,
-                                                       avg_loss, var_list)
-                    self.assertEqual(
-                        expected_lr,
-                        actual_lr,
-                        msg='Failed reduce lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
-                        format(epoch_num, expected_lr, actual_lr))
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d07a80da15dbfd35ffdedbcb09e82d59a84486e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_linear.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+
+
+class LinearTestCase(unittest.TestCase):
+    def setUp(self):
+        self.dtype = 'float32'
+        self.input = np.ones((3, 1, 2)).astype(self.dtype)
+        self.weight = np.ones((2, 2)).astype(self.dtype)
+        self.bias = np.ones((2)).astype(self.dtype)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+
+    def functional(self, place):
+        paddle.disable_static(place)
+        input = paddle.to_tensor(self.input)
+        weight = paddle.to_tensor(self.weight)
+        bias = paddle.to_tensor(self.bias)
+        out = F.linear(input, weight, bias)
+        return out.numpy()
+
+    def paddle_nn_layer(self, place):
+        paddle.disable_static(place)
+        input = paddle.to_tensor(self.input)
+        weight_attr = fluid.ParamAttr(
+            name="linear_weight",
+            learning_rate=1.0,
+            trainable=False,
+            regularizer=None,
+            initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+        bias_attr = fluid.ParamAttr(
+            name="linear_bias",
+            learning_rate=1.0,
+            trainable=False,
+            regularizer=None,
+            initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+        linear = paddle.nn.Linear(
+            2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
+        y = linear(input)
+        return y.numpy()
+
+    def numpy_cal(self):
+        res = np.matmul(self.input, self.weight) + self.bias
+        return res
+
+    def test_error(self, place=paddle.CPUPlace()):
+        res_f = self.functional(place)
+        res_nn = self.paddle_nn_layer(place)
+        res_np = self.numpy_cal()
+        np.testing.assert_array_almost_equal(res_f, res_nn)
+        np.testing.assert_array_almost_equal(res_nn, res_np)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
index 98f7cd5b6b2dc8c82a71edf7ec36a24921726e3c..53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901 100755
--- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
-from paddle.nn.functional import *
+from paddle.nn.functional import interpolate
 
 
 def linear_interp_np(input,
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..04b56677fc158583fe79ec0dc1276210bd2ebbdc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
@@ -0,0 +1,438 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import platform
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+from paddle.nn.functional import interpolate
+
+
+def linear_interp_np(input,
+                     out_w,
+                     out_size=None,
+                     actual_shape=None,
+                     align_corners=True,
+                     align_mode=0,
+                     data_layout='NCHW'):
+    if data_layout == "NHWC":
+        input = np.transpose(input, (0, 2, 1))  # NHWC => NCHW
+    if out_size is not None:
+        out_w = out_size[0]
+    if actual_shape is not None:
+        out_w = actual_shape[0]
+    batch_size, channel, in_w = input.shape
+
+    ratio_w = 0.0
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_w))
+
+    for j in range(out_w):
+        if (align_mode == 0 and not align_corners):
+            w = int(ratio_w * (j + 0.5) - 0.5)
+        else:
+            w = int(ratio_w * j)
+        w = max(0, w)
+        wid = 1 if w < in_w - 1 else 0
+
+        if (align_mode == 0 and not align_corners):
+            idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0)
+            w1lambda = idx_src_w - w
+        else:
+            w1lambda = ratio_w * j - w
+        w2lambda = 1.0 - w1lambda
+
+        out[:, :, j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w +
+                                                                    wid]
+
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 1))  # NCHW => NHWC
+
+    return out.astype(input.dtype)
+
+
+class TestLinearInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_w = self.input_shape[2]
+        else:
+            in_w = self.input_shape[1]
+
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(in_w * self.scale)
+        else:
+            out_w = self.out_w
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [float(self.scale)]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        if platform.system() == "Linux":
+            self.check_output(atol=1e-7)
+        else:
+            self.check_output(atol=1e-5)
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [1, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestLinearInterpOpDataLayout(TestLinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [1, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = False
+        self.align_mode = 1
+        self.data_layout = 'NHWC'
+
+
+class TestLinearInterpOpAlignMode(TestLinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [1, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestLinearInterpOpScale(TestLinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [1, 3, 100]
+        self.out_w = 50
+        self.scale = 0.5
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestLinearInterpOpSizeTensor(TestLinearInterpOp):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+
+        if self.data_layout == "NCHW":
+            in_w = self.input_shape[2]
+        else:
+            in_w = self.input_shape[1]
+
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(in_w * self.scale)
+        else:
+            out_w = self.out_w
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode, self.data_layout)
+
+        self.inputs = {'X': input_np}
+        if self.out_size is not None and self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.out_size
+        elif self.actual_shape is not None and self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.actual_shape
+        else:
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs = {
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+
+class TestResizeLinearAPI(unittest.TestCase):
+    def test_case(self):
+        x = fluid.data(name="x", shape=[1, 3, 64], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[1], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[1], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_linear(
+            x, out_shape=[128, ], align_mode=1, align_corners=False)
+        out2 = fluid.layers.resize_linear(
+            x, out_shape=[128], align_mode=1, align_corners=False)
+        out3 = fluid.layers.resize_linear(
+            x, out_shape=shape_tensor, align_mode=1, align_corners=False)
+        out4 = fluid.layers.resize_linear(
+            x,
+            out_shape=[128, ],
+            actual_shape=actual_size,
+            align_mode=1,
+            align_corners=False)
+        out5 = fluid.layers.resize_linear(
+            x, scale=scale_tensor, align_mode=1, align_corners=False)
+
+        out6 = interpolate(
+            x,
+            scale_factor=scale_tensor,
+            mode='linear',
+            align_mode=1,
+            align_corners=False,
+            data_format='NCW')
+        out7 = interpolate(
+            x,
+            size=[128, ],
+            mode='linear',
+            align_mode=1,
+            align_corners=False,
+            data_format='NCW')
+        out8 = interpolate(
+            x,
+            size=shape_tensor,
+            mode='linear',
+            align_mode=1,
+            align_corners=False,
+            data_format='NCW')
+
+        x_data = np.random.random((1, 3, 64)).astype("float32")
+        dim_data = np.array([128]).astype("int32")
+        shape_data = np.array([128, ]).astype("int32")
+        actual_size_data = np.array([128, ]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(
+            fluid.default_main_program(),
+            feed={
+                "x": x_data,
+                "dim": dim_data,
+                "shape_tensor": shape_data,
+                "actual_size": actual_size_data,
+                "scale_tensor": scale_data
+            },
+            fetch_list=[out1, out2, out3, out4, out5, out6, out7, out8],
+            return_numpy=True)
+
+        expect_res = linear_interp_np(
+            x_data, out_w=128, align_mode=1, align_corners=False)
+        for res in results:
+            self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestLinearInterpOpAPI2_0(unittest.TestCase):
+    def test_case(self):
+
+        # dygraph 
+        x_data = np.random.random((1, 3, 128)).astype("float32")
+        us_1 = paddle.nn.UpSample(
+            size=[64, ],
+            mode='linear',
+            align_mode=1,
+            align_corners=False,
+            data_format='NCW')
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = us_1(x)
+
+            expect = linear_interp_np(
+                x_data, out_w=64, align_mode=1, align_corners=False)
+
+            self.assertTrue(np.allclose(interp.numpy(), expect))
+
+
+class TestResizeLinearOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("uint8")
+
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(self.input_shape[2] * self.scale)
+        else:
+            out_w = self.out_w
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = {
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        if platform.system() == "Linux":
+            self.check_output_with_place(place=core.CPUPlace(), atol=1e-7)
+        else:
+            self.check_output_with_place(place=core.CPUPlace(), atol=1e-5)
+
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [2, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestLinearInterpOpException(unittest.TestCase):
+    def test_exception(self):
+        def input_shape_error():
+            x1 = fluid.data(name="x1", shape=[1], dtype="float32")
+            out = fluid.layers.resize_linear(
+                x1, out_shape=[256, ], data_format='NCW')
+
+        def data_format_error():
+            x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
+            out = fluid.layers.resize_linear(
+                x2, out_shape=[256, ], data_format='NHWCD')
+
+        def out_shape_error():
+            x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
+            out = fluid.layers.resize_linear(
+                x3, out_shape=[
+                    256,
+                    256,
+                ], data_format='NHWC')
+
+        self.assertRaises(ValueError, input_shape_error)
+        self.assertRaises(ValueError, data_format_error)
+        self.assertRaises(ValueError, out_shape_error)
+
+
+class TestLinearInterpOpError(unittest.TestCase):
+    def test_error(self):
+        with program_guard(Program(), Program()):
+
+            def input_shape_error():
+                x1 = fluid.data(name="x1", shape=[1], dtype="float32")
+                out1 = paddle.nn.UpSample(
+                    size=[256, ], data_format='NCW', mode='linear')
+                out1_res = out1(x1)
+
+            def data_format_error():
+                x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
+                out2 = paddle.nn.UpSample(
+                    size=[256, ], data_format='NHWCD', mode='linear')
+                out2_res = out2(x2)
+
+            def out_shape_error():
+                x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
+                out3 = paddle.nn.UpSample(
+                    size=[
+                        256,
+                        256,
+                    ], data_format='NHWC', mode='linear')
+                out3_res = out3(x3)
+
+            self.assertRaises(ValueError, input_shape_error)
+            self.assertRaises(ValueError, data_format_error)
+            self.assertRaises(ValueError, out_shape_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
index c7bab1a135bc439eefa822087869e08a43de0c51..6d1f42111eebff0f469317ddf2a9ec7698a7ae1e 100644
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
@@ -32,6 +32,7 @@ class TestLinspaceOpCommonCase(OpTest):
             'Stop': np.array([10]).astype(dtype),
             'Num': np.array([11]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.arange(0, 11).astype(dtype)}
 
@@ -48,6 +49,7 @@ class TestLinspaceOpReverseCase(OpTest):
             'Stop': np.array([0]).astype(dtype),
             'Num': np.array([11]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)}
 
@@ -64,6 +66,7 @@ class TestLinspaceOpNumOneCase(OpTest):
             'Stop': np.array([0]).astype(dtype),
             'Num': np.array([1]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.array(10, dtype=dtype)}
 
@@ -72,6 +75,26 @@ class TestLinspaceOpNumOneCase(OpTest):
 
 
 class TestLinspaceAPI(unittest.TestCase):
+    def test_variable_input1(self):
+        start = paddle.full(shape=[1], fill_value=0, dtype='float32')
+        stop = paddle.full(shape=[1], fill_value=10, dtype='float32')
+        num = paddle.full(shape=[1], fill_value=5, dtype='int32')
+        out = paddle.linspace(start, stop, num, dtype='float32')
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res = exe.run(fluid.default_main_program(), fetch_list=[out])
+        np_res = np.linspace(0, 10, 5, dtype='float32')
+        self.assertEqual((res == np_res).all(), True)
+
+    def test_variable_input2(self):
+        paddle.disable_static()
+        start = paddle.full(shape=[1], fill_value=0, dtype='float32')
+        stop = paddle.full(shape=[1], fill_value=10, dtype='float32')
+        num = paddle.full(shape=[1], fill_value=5, dtype='int32')
+        out = paddle.linspace(start, stop, num, dtype='float32')
+        np_res = np.linspace(0, 10, 5, dtype='float32')
+        self.assertEqual((out.numpy() == np_res).all(), True)
+        paddle.enable_static()
+
     def test_dtype(self):
         out_1 = paddle.linspace(0, 10, 5, dtype='float32')
         out_2 = paddle.linspace(0, 10, 5, dtype=np.float32)
@@ -82,16 +105,23 @@ class TestLinspaceAPI(unittest.TestCase):
         assert np.array_equal(res_1, res_2)
 
     def test_name(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             out = paddle.linspace(
                 0, 10, 5, dtype='float32', name='linspace_res')
             assert 'linspace_res' in out.name
 
     def test_imperative(self):
-        with paddle.imperative.guard():
-            out = paddle.linspace(0, 10, 5, dtype='float32')
-            np_out = np.linspace(0, 10, 5, dtype='float32')
-        self.assertEqual((out.numpy() == np_out).all(), True)
+        paddle.disable_static()
+        out1 = paddle.linspace(0, 10, 5, dtype='float32')
+        np_out1 = np.linspace(0, 10, 5, dtype='float32')
+        out2 = paddle.linspace(0, 10, 5, dtype='int32')
+        np_out2 = np.linspace(0, 10, 5, dtype='int32')
+        out3 = paddle.linspace(0, 10, 200, dtype='int32')
+        np_out3 = np.linspace(0, 10, 200, dtype='int32')
+        paddle.enable_static()
+        self.assertEqual((out1.numpy() == np_out1).all(), True)
+        self.assertEqual((out2.numpy() == np_out2).all(), True)
+        self.assertEqual((out3.numpy() == np_out3).all(), True)
 
 
 class TestLinspaceOpError(unittest.TestCase):
@@ -99,7 +129,12 @@ class TestLinspaceOpError(unittest.TestCase):
         with program_guard(Program(), Program()):
 
             def test_dtype():
-                fluid.layers.linspace(0, 10, 1, dtype="int32")
+                fluid.layers.linspace(0, 10, 1, dtype="int8")
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_dtype():
+                fluid.layers.linspace(0, 10, 1.33, dtype="int32")
 
             self.assertRaises(TypeError, test_dtype)
 
@@ -119,20 +154,20 @@ class TestLinspaceOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_step_dtype)
 
             def test_start_dtype():
-                start = fluid.data(shape=[1], type="int32", name="start")
+                start = fluid.data(shape=[1], dtype="int32", name="start")
                 fluid.layers.linspace(start, 10, 1, dtype="float32")
 
             self.assertRaises(TypeError, test_start_dtype)
 
             def test_end_dtype():
-                end = fluid.data(shape=[1], type="int32", name="end")
+                end = fluid.data(shape=[1], dtype="int32", name="end")
                 fluid.layers.linspace(0, end, 1, dtype="float32")
 
             self.assertRaises(TypeError, test_end_dtype)
 
-            def test_step_dtype():
-                step = fluid.data(shape=[1], type="int32", name="step")
-                fluid.layers.linspace(0, 10, step, dtype="float32")
+            def test_num_dtype():
+                num = fluid.data(shape=[1], dtype="int32", name="step")
+                fluid.layers.linspace(0, 10, num, dtype="float32")
 
             self.assertRaises(TypeError, test_step_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index 2b77624734d335bd999754b378971bcc5c945fa5..e3d7003ecedb60f9b4f9a542ed08ca88d894d24a 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -14,93 +14,136 @@
 
 import unittest
 import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.nn as nn
-import paddle.nn.functional as functional
+from op_test import OpTest
+import paddle
+import paddle.nn.functional as F
 
+np.random.seed(10)
 
-def stable_softmax(x):
+
+def ref_log_softmax(x):
     shiftx = (x - np.max(x))
-    exps = np.exp(shiftx)
-    return exps / np.sum(exps)
+    out = shiftx - np.log(np.exp(shiftx).sum())
+    return out
 
 
-def ref_log_softmax(x, axis=None, dtype=None):
-    x_t = x.copy()
-    if dtype is not None:
-        x_t = x_t.astype(dtype)
-    if axis is None:
-        axis = -1
-    out = np.apply_along_axis(stable_softmax, axis, x_t)
-    return np.log(out)
+def ref_log_softmax_grad(x, axis):
+    if axis < 0:
+        axis += len(x.shape)
+    out = np.apply_along_axis(ref_log_softmax, axis, x)
+    axis_dim = x.shape[axis]
+    dout = np.full_like(x, fill_value=1. / x.size)
+    dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat(
+        axis_dim, axis=axis)
+    return dx
 
 
-class TestNNLogSoftmaxAPI(unittest.TestCase):
+class TestLogSoftmaxOp(OpTest):
     def setUp(self):
-        self.init_data()
+        self.op_type = 'log_softmax'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.set_attrs()
 
-    def init_data(self):
-        self.x_shape = [2, 3, 4, 5]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype)
+        out = np.apply_along_axis(ref_log_softmax, self.axis, x)
+        self.x_grad = ref_log_softmax_grad(x, self.axis)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {'axis': self.axis}
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'], user_defined_grads=[self.x_grad])
+
+
+class TestLogSoftmaxShape(TestLogSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [12, 10]
 
-    def check_api(self, place=fluid.CPUPlace(), axis=None):
-        ref_out = ref_log_softmax(self.x, axis)
 
-        main_program = fluid.Program()
-        mylogsoftmax = nn.LogSoftmax(axis)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = mylogsoftmax(x)
-        exe = fluid.Executor(place)
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+class TestLogSoftmaxAxis(TestLogSoftmaxOp):
+    def set_attrs(self):
+        self.axis = 1
+
+
+class TestNNLogSoftmaxAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def check_api(self, axis=-1):
+        ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x)
+
+        logsoftmax = paddle.nn.LogSoftmax(axis)
+        # test static api
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='x', shape=self.x_shape)
+            y = logsoftmax(x)
+            exe = paddle.static.Executor(self.place)
+            out = exe.run(feed={'x': self.x}, fetch_list=[y])
         self.assertTrue(np.allclose(out[0], ref_out))
 
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = mylogsoftmax(x)
+        # test dygrapg api
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = logsoftmax(x)
         self.assertTrue(np.allclose(y.numpy(), ref_out))
+        paddle.enable_static()
 
     def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for axis in [None, 2]:
-                self.check_api(place, axis)
+        for axis in [-1, 1]:
+            self.check_api(axis)
 
 
 class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
     def setUp(self):
-        self.init_data()
-
-    def init_data(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-
-    def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None):
-        ref_out = ref_log_softmax(self.x, axis, dtype)
-        main_program = fluid.Program()
-        mylogsoftmax = nn.LogSoftmax(axis)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.log_softmax(x, axis, dtype)
-        exe = fluid.Executor(place)
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.place = paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def check_api(self, axis=-1, dtype=None):
+        x = self.x.copy()
+        if dtype is not None:
+            x = x.astype(dtype)
+        ref_out = np.apply_along_axis(ref_log_softmax, axis, x)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='x', shape=self.x_shape)
+            y = F.log_softmax(x, axis, dtype)
+            exe = paddle.static.Executor(self.place)
+            out = exe.run(feed={'x': self.x}, fetch_list=[y])
         self.assertTrue(np.allclose(out[0], ref_out))
 
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = functional.log_softmax(x, axis, dtype)
-        self.assertTrue(np.allclose(y.numpy(), ref_out))
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = F.log_softmax(x, axis, dtype)
+        self.assertTrue(np.allclose(y.numpy(), ref_out), True)
+        paddle.enable_static()
 
     def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self.check_api(place, None, None)
-            self.check_api(place, None, np.float64)
+        for axis in [-1, 1]:
+            self.check_api(axis)
+        self.check_api(-1, 'float64')
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='X1', shape=[100], dtype='int32')
+            self.assertRaises(TypeError, F.log_softmax, x)
+
+            x = paddle.data(name='X2', shape=[100], dtype='float32')
+            self.assertRaises(TypeError, F.log_softmax, x, dtype='int32')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
old mode 100644
new mode 100755
index 8f0049a8d30d0e1fed1d27cf6e13c036e33678d0..c8bb8c5b73f7680fc8a329656ef2b899f14d96ea
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
@@ -17,51 +17,235 @@ from __future__ import print_function
 import op_test
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from paddle.static import Program, program_guard
 
+TEST_META_OP_DATA = [{
+    'op_str': 'logical_and',
+    'binary_op': True
+}, {
+    'op_str': 'logical_or',
+    'binary_op': True
+}, {
+    'op_str': 'logical_xor',
+    'binary_op': True
+}, {
+    'op_str': 'logical_not',
+    'binary_op': False
+}]
 
-def create_test_class(op_type, callback, binary_op=True):
-    class Cls(op_test.OpTest):
-        def setUp(self):
-            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-            if binary_op:
-                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-                c = callback(a, b)
-            else:
-                c = callback(a)
-            self.outputs = {'Out': c}
-            self.op_type = op_type
-            if binary_op:
-                self.inputs = {'X': a, 'Y': b}
+TEST_META_SHAPE_DATA = {
+    'XDimLargerThanYDim1': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [4, 5]
+    },
+    'XDimLargerThanYDim2': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [4, 1]
+    },
+    'XDimLargerThanYDim3': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [1, 4, 1]
+    },
+    'XDimLargerThanYDim4': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [3, 4, 1]
+    },
+    'XDimLargerThanYDim5': {
+        'x_shape': [2, 3, 1, 5],
+        'y_shape': [3, 1, 1]
+    },
+    'XDimLessThanYDim1': {
+        'x_shape': [4, 1],
+        'y_shape': [2, 3, 4, 5]
+    },
+    'XDimLessThanYDim2': {
+        'x_shape': [1, 4, 1],
+        'y_shape': [2, 3, 4, 5]
+    },
+    'XDimLessThanYDim3': {
+        'x_shape': [3, 4, 1],
+        'y_shape': [2, 3, 4, 5]
+    },
+    'XDimLessThanYDim4': {
+        'x_shape': [3, 1, 1],
+        'y_shape': [2, 3, 1, 5]
+    },
+    'XDimLessThanYDim5': {
+        'x_shape': [4, 5],
+        'y_shape': [2, 3, 4, 5]
+    },
+    'Axis1InLargerDim': {
+        'x_shape': [1, 4, 5],
+        'y_shape': [2, 3, 1, 5]
+    },
+    'EqualDim1': {
+        'x_shape': [10, 7],
+        'y_shape': [10, 7]
+    },
+    'EqualDim2': {
+        'x_shape': [1, 1, 4, 5],
+        'y_shape': [2, 3, 1, 5]
+    }
+}
+
+TEST_META_WRONG_SHAPE_DATA = {
+    'ErrorDim1': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [3, 4]
+    },
+    'ErrorDim2': {
+        'x_shape': [2, 3, 4, 5],
+        'y_shape': [4, 3]
+    }
+}
+
+
+def run_static(x_np, y_np, op_str, use_gpu=False, binary_op=True):
+    paddle.enable_static()
+    startup_program = fluid.Program()
+    main_program = fluid.Program()
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    with fluid.program_guard(main_program, startup_program):
+        x = paddle.static.data(name='x', shape=x_np.shape, dtype='bool')
+        op = getattr(paddle, op_str)
+        feed_list = {'x': x_np}
+        if not binary_op:
+            res = op(x)
+        else:
+            y = paddle.static.data(name='y', shape=y_np.shape, dtype='bool')
+            feed_list['y'] = y_np
+            res = op(x, y)
+        exe.run(startup_program)
+        static_result = exe.run(main_program, feed=feed_list, fetch_list=[res])
+    return static_result
+
+
+def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True):
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    paddle.disable_static(place)
+    op = getattr(paddle, op_str)
+    x = paddle.to_tensor(x_np)
+    if not binary_op:
+        dygraph_result = op(x)
+    else:
+        y = paddle.to_tensor(y_np)
+        dygraph_result = op(x, y)
+    return dygraph_result
+
+
+def np_data_generator(np_shape, *args, **kwargs):
+    return np.random.choice(a=[True, False], size=np_shape).astype(bool)
+
+
+def test(unit_test, use_gpu=False, test_error=False):
+    for op_data in TEST_META_OP_DATA:
+        meta_data = dict(op_data)
+        meta_data['use_gpu'] = use_gpu
+        np_op = getattr(np, meta_data['op_str'])
+        META_DATA = dict(TEST_META_SHAPE_DATA)
+        if test_error:
+            META_DATA = dict(TEST_META_WRONG_SHAPE_DATA)
+        for shape_data in META_DATA.values():
+            meta_data['x_np'] = np_data_generator(shape_data['x_shape'])
+            meta_data['y_np'] = np_data_generator(shape_data['y_shape'])
+            if meta_data['binary_op'] and test_error:
+                # catch C++ Exception
+                unit_test.assertRaises(BaseException, run_static, **meta_data)
+                unit_test.assertRaises(BaseException, run_dygraph, **meta_data)
+                continue
+            static_result = run_static(**meta_data)
+            dygraph_result = run_dygraph(**meta_data)
+            if meta_data['binary_op']:
+                np_result = np_op(meta_data['x_np'], meta_data['y_np'])
             else:
-                self.inputs = {'X': a}
-
-        def test_output(self):
-            self.check_output()
-
-        def test_error(self):
-            with program_guard(Program(), Program()):
-                x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-                y = fluid.layers.data(name='y', shape=[2], dtype='bool')
-                a = fluid.layers.data(name='a', shape=[2], dtype='int32')
-                op = eval("fluid.layers.%s" % self.op_type)
-                if self.op_type != "logical_not":
-                    self.assertRaises(TypeError, op, x=x, y=y, out=1)
-                    self.assertRaises(TypeError, op, x=x, y=a)
-                    self.assertRaises(TypeError, op, x=a, y=y)
-                else:
-                    self.assertRaises(TypeError, op, x=x, out=1)
-                    self.assertRaises(TypeError, op, x=a)
-
-    Cls.__name__ = op_type
-    globals()[op_type] = Cls
-
-
-create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
-create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
-create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
-create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
+                np_result = np_op(meta_data['x_np'])
+            unit_test.assertTrue((static_result == np_result).all())
+            unit_test.assertTrue((dygraph_result.numpy() == np_result).all())
+
+
+def test_type_error(unit_test, use_gpu, type_str_map):
+    def check_type(op_str, x, y, binary_op):
+        op = getattr(paddle, op_str)
+        error_type = TypeError
+        if isinstance(x, np.ndarray):
+            x = paddle.to_tensor(x)
+            y = paddle.to_tensor(y)
+            error_type = BaseException
+        if binary_op:
+            if type_str_map['x'] != 'bool' or type_str_map['y'] != 'bool':
+                unit_test.assertRaises(error_type, op, x=x, y=y)
+            if not fluid.in_dygraph_mode():
+                unit_test.assertRaises(error_type, op, x=x, y=y, out=1)
+        else:
+            if type_str_map['x'] != 'bool':
+                unit_test.assertRaises(error_type, op, x=x)
+            if not fluid.in_dygraph_mode():
+                unit_test.assertRaises(error_type, op, x=x, out=1)
+
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    for op_data in TEST_META_OP_DATA:
+        meta_data = dict(op_data)
+        binary_op = meta_data['binary_op']
+
+        paddle.disable_static(place)
+        x = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['x'])
+        y = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['y'])
+        check_type(meta_data['op_str'], x, y, binary_op)
+
+        paddle.enable_static()
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.static.data(
+                name='x', shape=[10], dtype=type_str_map['x'])
+            y = paddle.static.data(
+                name='y', shape=[10], dtype=type_str_map['y'])
+            check_type(meta_data['op_str'], x, y, binary_op)
+
+
+def type_map_factory():
+    x_type_list = ['float32', 'float64', 'int32', 'int64', 'bool']
+    y_type_list = ['float32', 'float64', 'int32', 'int64', 'bool']
+    return [{
+        'x': x_type,
+        'y': y_type
+    } for x_type in x_type_list for y_type in y_type_list]
+
+
+class TestCPU(unittest.TestCase):
+    def test(self):
+        test(self)
+
+    def test_error(self):
+        test(self, False, True)
+
+    def test_type_error(self):
+        type_map_list = type_map_factory()
+        for type_map in type_map_list:
+            test_type_error(self, False, type_map)
+
+
+class TestCUDA(unittest.TestCase):
+    def test(self):
+        test(self, True)
+
+    def test_error(self):
+        test(self, True, True)
+
+    def test_type_error(self):
+        type_map_list = type_map_factory()
+        for type_map in type_map_list:
+            test_type_error(self, True, type_map)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py
index 508b4a7b72da8affbc7ddf590b8142a41d1f3191..c2201a52605bc87246fb9c8734494b19f83ff180 100644
--- a/python/paddle/fluid/tests/unittests/test_logsumexp.py
+++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py
@@ -12,64 +12,128 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
 import paddle
-import paddle.fluid as fluid
 import unittest
 import numpy as np
 from op_test import OpTest
-from paddle.fluid import Program, program_guard
-from paddle.fluid.layer_helper import LayerHelper
 
 
-class TestLogSumOpError(unittest.TestCase):
+def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False):
+    if isinstance(axis, int):
+        axis = (axis, )
+    elif isinstance(axis, list):
+        axis = tuple(axis)
+    if reduce_all:
+        axis = None
+    out = np.log(np.exp(x).sum(axis=axis, keepdims=keepdim))
+    return out
+
+
+class TestLogsumexp(OpTest):
+    def setUp(self):
+        self.op_type = 'logsumexp'
+        self.shape = [2, 3, 4, 5]
+        self.dtype = 'float64'
+        self.axis = [-1]
+        self.keepdim = False
+        self.reduce_all = False
+        self.set_attrs()
+
+        np.random.seed(10)
+        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out = ref_logsumexp(x, self.axis, self.keepdim, self.reduce_all)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keepdim,
+            'reduce_all': self.reduce_all
+        }
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'])
+
+
+class TestLogsumexp_shape(TestLogsumexp):
+    def set_attrs(self):
+        self.shape = [4, 5, 6]
+
+
+class TestLogsumexp_axis(TestLogsumexp):
+    def set_attrs(self):
+        self.axis = [0, -1]
+
+
+class TestLogsumexp_axis_all(TestLogsumexp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+
+
+class TestLogsumexp_keepdim(TestLogsumexp):
+    def set_attrs(self):
+        self.keepdim = True
+
+
+class TestLogsumexp_reduce_all(TestLogsumexp):
+    def set_attrs(self):
+        self.reduce_all = True
+
+
+class TestLogsumexpError(unittest.TestCase):
     def test_errors(self):
-        with program_guard(Program(), Program()):
-
-            x1 = fluid.layers.data(name='x1', shape=[120], dtype="uint8")
-            self.assertRaises(Exception, paddle.logsumexp, x1)
-
-            x2 = fluid.layers.data(name='x2', shape=[2, 3], dtype="int")
-            self.assertRaises(Exception, paddle.logsumexp, x2)
-
-            x3 = fluid.layers.data(name='x3', shape=[3], dtype="float16")
-            self.assertRaises(Exception, paddle.logsumexp, x3)
-
-            self.assertRaises(AssertionError, paddle.logsumexp, None)
-
-
-class TestLogSumExpOp(unittest.TestCase):
-    def test_dygraph(self):
-        with fluid.dygraph.guard():
-            np_x = np.random.uniform(0.1, 1, [123]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(x).numpy(), np.log(np.sum(np.exp(np_x)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, dim=[1, 2]).numpy(),
-                    np.log(np.sum(np.exp(np_x), axis=(1, 2)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, dim=[2]).numpy(),
-                    np.log(np.sum(np.exp(np_x), axis=(2)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, keepdim=True).numpy(),
-                    np.log(np.sum(np.exp(np_x), keepdims=True))))
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.assertRaises(TypeError, paddle.logsumexp, 1)
+            x1 = paddle.data(name='x1', shape=[120], dtype="int32")
+            self.assertRaises(TypeError, paddle.logsumexp, x1)
+
+
+class TestLogsumexpAPI(unittest.TestCase):
+    def setUp(self):
+        self.shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1, 1, self.shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def api_case(self, axis=None, keepdim=False):
+        out_ref = ref_logsumexp(self.x, axis, keepdim)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape)
+            out = paddle.logsumexp(x, axis, keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        self.assertTrue(np.allclose(res[0], out_ref))
+
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x)
+        out = paddle.logsumexp(x, axis, keepdim)
+        self.assertTrue(np.allclose(out.numpy(), out_ref))
+        paddle.enable_static()
+
+    def test_api(self):
+        self.api_case()
+        self.api_case(2)
+        self.api_case([-1])
+        self.api_case([2, -3])
+        self.api_case((0, 1, -1))
+        self.api_case(keepdim=True)
+
+    def test_alias(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x)
+        out1 = paddle.logsumexp(x)
+        out2 = paddle.tensor.logsumexp(x)
+        out3 = paddle.tensor.math.logsumexp(x)
+        out_ref = ref_logsumexp(self.x)
+        for out in [out1, out2, out3]:
+            self.assertTrue(np.allclose(out.numpy(), out_ref))
+        paddle.enable_static()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f655e363e964893a7ab4c0a966856f873800ff6c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -0,0 +1,519 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import copy
+import math
+import numpy as np
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.framework as framework
+import paddle.fluid.core as core
+
+
+def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
+                         var_list):
+    def is_better(current, best, m, n):
+        if m == 'min' and n == 'rel':
+            return current < best - best * threshold
+        elif m == 'min' and n == 'abs':
+            return current < best - threshold
+        elif m == 'max' and n == 'rel':
+            return current > best + best * threshold
+        else:  # mode == 'max' and epsilon_mode == 'abs':
+            return current > best + threshold
+
+    if var_list[2] > 0:
+        var_list[2] -= 1
+        return var_list[1]
+
+    if is_better(loss, var_list[0], m, n):
+        var_list[0] = loss
+        var_list[3] = 0
+    else:
+        var_list[3] += 1
+        if var_list[3] > patience:
+            var_list[2] = cooldown
+            var_list[3] = 0
+            new_lr = var_list[1] * decay_rate
+            var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1]
+
+    return var_list[1]
+
+
+class TestReduceLROnPlateauDecay(object):
+    def test_ReduceLR(self):
+        # the decay rate must be less than 1.0
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
+        # the mode must be "min" or "max"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
+        # the threshold_mode must be "rel" or "abs"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(
+                learning_rate=1.0, threshold_mode="test")
+        with self.assertRaises(TypeError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
+        with self.assertRaises(TypeError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
+
+        places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
+
+        for place in places:
+            for m, n in zip(['min', 'max', 'min', 'max'],
+                            ['rel', 'rel', 'abs', 'abs']):
+                kwargs = {
+                    'learning_rate': 1.0,
+                    'mode': m,
+                    'factor': 0.5,
+                    'patience': 3,
+                    'threshold': 1e-4,
+                    'threshold_mode': n,
+                    'cooldown': 1,
+                    'min_lr': 0,
+                    'epsilon': 1e-8,
+                    'verbose': False,
+                }
+                paddle.enable_static()
+                self._test_static(place, kwargs)
+                paddle.disable_static(place)
+                self._test_dygraph(place, kwargs)
+                paddle.enable_static()
+
+    def _test_static(self, place, kwargs):
+        paddle.enable_static()
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
+
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            x = fluid.layers.create_global_var(
+                [1], 1, 'float32', persistable=True)
+            paddle.increment(x)
+            loss = paddle.sin(x)
+            scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+            adam = paddle.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        exe = paddle.static.Executor(place)
+        exe.run(start_prog)
+
+        for epoch in range(20):
+            for batch_id in range(1):
+                out, actual_lr = exe.run(main_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+
+            scheduler.step(out[0])
+            actual_lr = scheduler()
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+        for epoch in range(10):
+            for batch_id in range(1):
+                out, actual_lr = exe.run(test_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+            scheduler.step(out[0])
+            actual_lr = scheduler()
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+    def _test_dygraph(self, place, kwargs):
+        paddle.disable_static(place)
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
+
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        adam = paddle.optimizer.Adam(
+            learning_rate=scheduler, parameters=linear.parameters())
+
+        for epoch in range(20):
+            for batch_id in range(1):
+                x = paddle.to_tensor(epoch).astype('float32')
+                loss = paddle.sin(x)
+                loss.backward()
+                adam.step()
+                adam.clear_grad()
+
+            scheduler.step(loss)
+            # get lr from paddle
+            current_lr = adam.get_lr()
+            # get lr form python
+            expected_lr = reduce_lr_on_plateau(
+                kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'],
+                loss, var_list)
+            self.assertEqual(current_lr, expected_lr)
+        state_dict = adam.state_dict()
+        scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        adam1 = paddle.optimizer.Adam(
+            learning_rate=scheduler1, parameters=linear.parameters())
+        adam1.set_state_dict(state_dict)
+        self.assertEqual(scheduler.cooldown_counter,
+                         scheduler1.cooldown_counter)
+        self.assertEqual(scheduler.best.numpy()[0], scheduler1.best)
+        self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
+        self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
+        self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
+
+
+def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False):
+    if epoch_num == 0:
+        a = 1
+    else:
+        a = math.pow(epoch_num, -0.5)
+    b = math.pow(warmup_steps, -1.5) * epoch_num
+    return learning_rate * math.pow(d_model, -0.5) * min(a, b)
+
+
+def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False):
+    return learning_rate * lr_lambda(epoch_num)
+
+
+def piecewise_lr(epoch_num, boundaries, values, verbose=False):
+    assert len(boundaries) + 1 == len(values)
+    for i in range(len(boundaries)):
+        if epoch_num < boundaries[i]:
+            return values[i]
+    return values[len(values) - 1]
+
+
+def exponential_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate * gamma**epoch_num
+
+
+def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate * math.exp(-1 * gamma * epoch_num)
+
+
+def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate / (1 + gamma * epoch_num)
+
+
+def polynomial_lr(epoch_num,
+                  learning_rate,
+                  decay_steps,
+                  end_lr=0.0001,
+                  power=1.0,
+                  cycle=False,
+                  verbose=False):
+
+    if cycle:
+        div = math.ceil(epoch_num / float(decay_steps))
+        if epoch_num == 0:
+            div = 1
+        decay_steps = decay_steps * div
+    else:
+        epoch_num = min(epoch_num, decay_steps)
+    return (learning_rate - end_lr) * (
+        (1 - float(epoch_num) / float(decay_steps))**power) + end_lr
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
+                math.pi / self.T_max)) / 2
+
+        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
+            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
+                self.last_lr - self.eta_min) + self.eta_min
+
+
+cosine_annealing_lr_current = None
+
+
+def cosine_annealing_lr(epoch_num,
+                        learning_rate,
+                        T_max,
+                        eta_min=0,
+                        verbose=False):
+    global cosine_annealing_lr_current
+    if epoch_num == 0:
+        cosine_annealing_lr_current = learning_rate
+    elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
+        cosine_annealing_lr_current = cosine_annealing_lr_current + (
+            learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max))
+                                        ) / 2
+    else:
+        cosine_annealing_lr_current = (1 + math.cos(
+            math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * (
+                epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current -
+                                                   eta_min) + eta_min
+    return cosine_annealing_lr_current
+
+
+def linear_warmup_lr(epoch_num,
+                     learning_rate,
+                     warmup_steps,
+                     start_lr,
+                     end_lr,
+                     verbose=False):
+    if epoch_num < warmup_steps:
+        return start_lr + (end_lr - start_lr) * (float(epoch_num) /
+                                                 float(warmup_steps))
+    else:
+        return learning_rate
+
+
+def multi_step_lr(epoch_num,
+                  learning_rate,
+                  milestones,
+                  gamma=0.1,
+                  verbose=False):
+    for i in range(len(milestones)):
+        if epoch_num < milestones[i]:
+            return learning_rate * (gamma**i)
+    return learning_rate * (gamma**len(milestones))
+
+
+def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False):
+    return learning_rate * math.pow(gamma, epoch_num // step_size)
+
+
+class TestLRScheduler(unittest.TestCase):
+    def _test_static(self, python_func, paddle_api, kwarg, place):
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            x = paddle.static.data(name='x', shape=[3, 4, 5])
+            y = paddle.static.data(name='y', shape=[3, 4, 5])
+            z = paddle.static.nn.fc(x, 100)
+            loss = paddle.mean(z)
+            scheduler = paddle_api(**kwarg)
+            adam = paddle.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        num = 0
+        exe = paddle.static.Executor(place)
+        exe.run(start_prog)
+        for epoch in range(5):
+            for batch_id in range(2):
+                out = exe.run(
+                    main_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        for epoch in range(5):
+            for batch_id in range(2):
+                out = exe.run(
+                    test_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        if isinstance(place, paddle.CPUPlace):
+            compiled_train_prog = paddle.static.CompiledProgram(
+                main_prog).with_data_parallel(
+                    loss_name=loss.name, places=fluid.cpu_places(4))
+            for epoch in range(5):
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_train_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_train_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+            compiled_test_prog = paddle.static.CompiledProgram(
+                test_prog).with_data_parallel(
+                    loss_name=loss.name,
+                    share_vars_from=compiled_train_prog,
+                    places=fluid.cpu_places(4))
+            for epoch in range(5):
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_test_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_test_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+    def _test_dygraph(self, python_func, paddle_api, kwarg, place):
+        paddle.disable_static(place)
+        x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle_api(**kwarg)
+        adam = paddle.optimizer.Adam(
+            learning_rate=scheduler, parameters=linear.parameters())
+        for epoch in range(20):
+            for batch_id in range(2):
+                x = paddle.to_tensor(x)
+                out = linear(x)
+                loss = paddle.reduce_mean(out)
+                loss.backward()
+                adam.step()
+                adam.clear_grad()
+            current_lr = adam.get_lr()
+            expected_lr = python_func(epoch, **kwarg)
+            if paddle_api.__name__ != "CosineAnnealingLR":
+                self.assertEqual(current_lr, expected_lr)
+                scheduler.step()
+            else:
+                self.assertAlmostEqual(current_lr, expected_lr)
+                scheduler.step(epoch + 1)
+
+    def test_scheduler(self):
+        with self.assertRaises(NotImplementedError):
+            paddle.optimizer.lr_scheduler._LRScheduler().step()
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate="test", milestones=[1, 2, 3])
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
+        with self.assertRaises(ValueError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[3, 2, 1])
+        with self.assertRaises(ValueError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
+
+        func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
+            "d_model": 0.01,
+            "warmup_steps": 100,
+            "verbose": False
+        }), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
+            "boundaries": [3, 6, 9, 15, 20],
+            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "verbose": False
+        }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": True
+        }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": False
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": False,
+            "verbose": True
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": True,
+            "verbose": False
+        }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
+            'learning_rate': 0.5,
+            'warmup_steps': 20,
+            'start_lr': 0,
+            'end_lr': 0.5,
+            "verbose": True
+        }), (exponential_lr, paddle.optimizer.ExponentialLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.9,
+            "verbose": False
+        }), (multi_step_lr, paddle.optimizer.MultiStepLR, {
+            "learning_rate": 0.5,
+            "milestones": [3, 6, 9, 15, 20],
+            "gamma": 0.8,
+            "verbose": True
+        }), (step_lr, paddle.optimizer.StepLR, {
+            "learning_rate": 0.5,
+            "step_size": 2,
+            "gamma": 0.8,
+            "verbose": False
+        }), (lambda_lr, paddle.optimizer.LambdaLR, {
+            "learning_rate": 0.5,
+            "lr_lambda": lambda x: 0.95**x,
+            "verbose": True
+        }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
+            "learning_rate": 0.5,
+            "T_max": 10,
+            "verbose": False
+        })]
+
+        for python_func, paddle_api, kwarg in func_api_kwargs:
+            places = [paddle.CPUPlace()]
+            if core.is_compiled_with_cuda():
+                places.append(paddle.CUDAPlace(0))
+
+            for place in places:
+                paddle.enable_static()
+                #self._test_static(python_func, paddle_api, kwarg, place)
+                paddle.disable_static(place)
+                self._test_dygraph(python_func, paddle_api, kwarg, place)
+                paddle.enable_static()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index d4189eca0369702120f079dae9067a58da1e9597..90430bbce4d1896c8fdbb829230f2ad8a691adff 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -20,15 +20,14 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle.fluid.layers as layers
 
 SIGMOID_THRESHOLD_MIN = -40.0
 SIGMOID_THRESHOLD_MAX = 13.0
 EXP_MAX_INPUT = 40.0
 
 
-def lstm_naive(
-        input,
-        w, ):
+def lstm_naive(input, w):
     seq_len, batch_size, hidden_size = input.shape
 
     offset = 0
@@ -86,8 +85,8 @@ def lstm_naive(
         return (2. / (1. + np.exp(y))) - 1.
 
     output = []
-    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
-    pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+    pre_h = np.zeros((1, batch_size, hidden_size), dtype=input.dtype)
+    pre_c = np.zeros((1, batch_size, hidden_size), dtype=input.dtype)
 
     for i in range(seq_len):
         emb_1 = input[i]
@@ -110,7 +109,6 @@ def lstm_naive(
 
     output = np.concatenate(output, -1)
     output = output.reshape((batch_size, -1, hidden_size))
-
     output = output.transpose((1, 0, 2))
 
     return output, pre_h, pre_c
@@ -119,11 +117,12 @@ def lstm_naive(
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNLstmOp(OpTest):
+    # TODO(GaoWei8):when input dtype is fp64, precision threshold should be removed.
     def setUp(self):
         self.op_type = "cudnn_lstm"
-        self.dtype = np.float32
+        self.dtype = np.float64
 
-        num_steps = 20
+        seq_length = 20
         batch_size = 5
         hidden_size = 20
 
@@ -133,33 +132,24 @@ class TestCUDNNLstmOp(OpTest):
         weight_size += hidden_size * 8
 
         input = np.random.uniform(
-            low=-0.1, high=0.1, size=(num_steps, batch_size,
+            low=-0.1, high=0.1, size=(seq_length, batch_size,
                                       hidden_size)).astype(self.dtype)
         flat_w = np.random.uniform(
             low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
 
         output, last_hidden, last_cell = lstm_naive(input, flat_w)
 
-        init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        scope = core.Scope()
-        program = fluid.Program()
-        block = program.global_block()
-
-        cache_temp = block.create_var(
-            name="Cache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW,
-            stop_gradient=True)
+        init_h = np.zeros((1, batch_size, hidden_size), dtype=np.float64)
+        init_c = np.zeros((1, batch_size, hidden_size), dtype=np.float64)
+        state_out = np.ndarray((300)).astype("uint8")
+
         self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
-            'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
-            'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
+            'Input': input,
+            'W': flat_w,
+            'InitH': init_h,
+            'InitC': init_c
         }
-        self.cache_name_list = ['Cache']
         self.attrs = {
-            'max_len': num_steps,
             'dropout_prob': 0.0,
             'is_bidirec': False,
             'input_size': hidden_size,
@@ -168,22 +158,61 @@ class TestCUDNNLstmOp(OpTest):
         }
         self.outputs = {
             'Out': output,
-            "last_h": last_hidden,
-            'last_c': last_cell
+            "LastH": last_hidden,
+            'LastC': last_cell,
+            'Reserve': np.ndarray((400)).astype("uint8"),
+            'StateOut': state_out
         }
 
     def test_output_with_place(self):
         # depend on the scope structure
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-5, check_dygraph=False)
+        self.check_output_with_place(
+            place, no_check_set=['Reserve', 'StateOut'])
 
     def test_grad_with_place(self):
         # depend on the scope structure
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
             place,
-            set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'last_h', 'last_c'],
-            check_dygraph=False)
+            set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'LastH', 'LastC'],
+            max_relative_error=1e-4)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestCUDNNlstmAPI(unittest.TestCase):
+    def test_lstm(self):
+        seq_len = 20
+        batch_size = 5
+        hidden_size = 20
+        dropout_prob = 0.0
+        num_layers = 1
+        input = fluid.data(
+            name='input',
+            shape=[seq_len, batch_size, hidden_size],
+            dtype='float64')
+        init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len,
+                                              hidden_size, num_layers,
+                                              dropout_prob)
+        exe = fluid.Executor(fluid.CUDAPlace(0))
+        exe.run(fluid.default_startup_program())
+        input_i = np.random.uniform(
+            low=-0.1, high=0.1, size=(seq_len, batch_size,
+                                      hidden_size)).astype("float64")
+        out = exe.run(fluid.default_main_program(),
+                      feed={'input': input_i},
+                      fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
+
+        output, last_hidden, last_cell = lstm_naive(input_i, out[3])
+
+        self.assertTrue(np.allclose(output, out[0], atol=1e-5))
+        self.assertTrue(np.allclose(last_hidden, out[1], atol=1e-5))
+        self.assertTrue(np.allclose(last_cell, out[2], atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..259a36e30d9a9c1852ff3800d5240ce7e7bb0e26
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
@@ -0,0 +1,124 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle
+
+
+def np_masked_select(x, mask):
+    result = np.empty(shape=(0), dtype=x.dtype)
+    for ele, ma in zip(np.nditer(x), np.nditer(mask)):
+        if ma:
+            result = np.append(result, ele)
+    return result.flatten()
+
+
+class TestMaskedSelectOp(OpTest):
+    def setUp(self):
+        self.init()
+        self.op_type = "masked_select"
+        x = np.random.random(self.shape).astype("float64")
+        mask = np.array(np.random.randint(2, size=self.shape, dtype=bool))
+        out = np_masked_select(x, mask)
+        self.inputs = {'X': x, 'Mask': mask}
+        self.outputs = {'Y': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y')
+
+    def init(self):
+        self.shape = (50, 3)
+
+
+class TestMaskedSelectOp1(TestMaskedSelectOp):
+    def init(self):
+        self.shape = (6, 8, 9, 18)
+
+
+class TestMaskedSelectOp2(TestMaskedSelectOp):
+    def init(self):
+        self.shape = (168, )
+
+
+class TestMaskedSelectAPI(unittest.TestCase):
+    def test_imperative_mode(self):
+        paddle.disable_static()
+        shape = (88, 6, 8)
+        np_x = np.random.random(shape).astype('float32')
+        np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+        x = paddle.to_tensor(np_x)
+        mask = paddle.to_tensor(np_mask)
+        out = paddle.masked_select(x, mask)
+        np_out = np_masked_select(np_x, np_mask)
+        self.assertEqual(np.allclose(out.numpy(), np_out), True)
+        paddle.enable_static()
+
+    def test_static_mode(self):
+        shape = [8, 9, 6]
+        x = paddle.data(shape=shape, dtype='float32', name='x')
+        mask = paddle.data(shape=shape, dtype='bool', name='mask')
+        np_x = np.random.random(shape).astype('float32')
+        np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+        out = paddle.masked_select(x, mask)
+        np_out = np_masked_select(np_x, np_mask)
+
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
+
+        res = exe.run(paddle.static.default_main_program(),
+                      feed={"x": np_x,
+                            "mask": np_mask},
+                      fetch_list=[out])
+        self.assertEqual(np.allclose(res, np_out), True)
+
+
+class TestMaskedSelectError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='float32', name='x')
+            mask = paddle.data(shape=shape, dtype='bool', name='mask')
+            mask_float = paddle.data(
+                shape=shape, dtype='float32', name='mask_float')
+            np_x = np.random.random(shape).astype('float32')
+            np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+            def test_x_type():
+                paddle.masked_select(np_x, mask)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_mask_type():
+                paddle.masked_select(x, np_mask)
+
+            self.assertRaises(TypeError, test_mask_type)
+
+            def test_mask_dtype():
+                paddle.masked_select(x, mask_float)
+
+            self.assertRaises(TypeError, test_mask_dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5..00137f63e244a0e166047e89f9ef436da158ed16 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase):
     @prog_scope()
     def test_integer_div(self):
         a = fluid.layers.data(name="a", shape=[1], dtype='int64')
-        b = a / 7
+        b = a / 2
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64')
+        a_np = numpy.array([3, 4, 10, 14, 9, 18])
         b_np, = exe.run(fluid.default_main_program(),
                         feed={"a": a_np},
                         fetch_list=[b])
-
-        b_np_actual = (a_np / 7).astype('int64')
+        # for paddle2.0, use true_divide
+        b_np_actual = (a_np / 2.0)
         self.assertTrue(numpy.array_equal(b_np, b_np_actual))
 
     @prog_scope()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 803293be9b7d637875b56b443b04c246737ed2f8..9bb12d546550a821e8a133dd9c91d5d41a50b1b2 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 import six
@@ -284,6 +285,223 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL)
             self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np))
 
+    def test_tensor_patch_method(self):
+        paddle.disable_static()
+        x_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype)
+        y_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype)
+        z_np = np.random.uniform(-1, 1, [6, 9]).astype(self.dtype)
+
+        x = paddle.to_tensor(x_np)
+        y = paddle.to_tensor(y_np)
+        z = paddle.to_tensor(z_np)
+
+        a = paddle.to_tensor([[1, 1], [2, 2], [3, 3]])
+        b = paddle.to_tensor([[1, 1], [2, 2], [3, 3]])
+
+        # 1. Unary operation for Tensor
+        self.assertEqual(x.dim(), 2)
+        self.assertEqual(x.ndimension(), 2)
+        self.assertEqual(x.ndim, 2)
+        self.assertEqual(x.size(), [2, 3])
+        self.assertTrue(
+            np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.logsigmoid().numpy(),
+                           fluid.layers.logsigmoid(x).numpy()))
+        self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.atan().numpy(), paddle.atan(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.tanh_shrink().numpy(),
+                           fluid.layers.tanh_shrink(x).numpy()))
+        self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy()))
+        m = x.abs()
+        self.assertTrue(
+            np.array_equal(m.sqrt().numpy(), paddle.sqrt(m).numpy()))
+        self.assertTrue(
+            np.array_equal(m.rsqrt().numpy(), paddle.rsqrt(m).numpy()))
+        self.assertTrue(
+            np.array_equal(x.ceil().numpy(), paddle.ceil(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.floor().numpy(), paddle.floor(x).numpy()))
+        self.assertTrue(np.array_equal(x.cos().numpy(), paddle.cos(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.acos().numpy(), paddle.acos(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.asin().numpy(), paddle.asin(x).numpy()))
+        self.assertTrue(np.array_equal(x.sin().numpy(), paddle.sin(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.sinh().numpy(), paddle.sinh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.cosh().numpy(), paddle.cosh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.round().numpy(), paddle.round(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.reciprocal().numpy(), paddle.reciprocal(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.square().numpy(), paddle.square(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.softplus().numpy(),
+                           fluid.layers.softplus(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.softsign().numpy(),
+                           fluid.layers.softsign(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
+        m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32')
+        m = m.matmul(m.t())
+        self.assertTrue(
+            np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy()))
+
+        self.assertTrue(
+            np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.isfinite().numpy(), paddle.isfinite(x).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.cast('int32').numpy(), paddle.cast(x, 'int32').numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.expand([3, 2, 3]).numpy(),
+                paddle.expand(x, [3, 2, 3]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.tile([2, 2]).numpy(), paddle.tile(x, [2, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(x.flatten().numpy(), paddle.flatten(x).numpy()))
+        index = paddle.to_tensor([0, 1])
+        self.assertTrue(
+            np.array_equal(
+                x.gather(index).numpy(), paddle.gather(x, index).numpy()))
+        index = paddle.to_tensor([[0, 1], [1, 2]])
+        self.assertTrue(
+            np.array_equal(
+                x.gather_nd(index).numpy(), paddle.gather_nd(x, index).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.reverse([0, 1]).numpy(), paddle.reverse(x, [0, 1]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                a.reshape([3, 2]).numpy(), paddle.reshape(a, [3, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.slice([0, 1], [0, 0], [1, 2]).numpy(),
+                paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.split(2)[0].numpy(), paddle.split(x, 2)[0].numpy()))
+        m = paddle.to_tensor(
+            np.random.uniform(-1, 1, [1, 6, 1, 1]).astype(self.dtype))
+        self.assertTrue(
+            np.array_equal(
+                m.squeeze([]).numpy(), paddle.squeeze(m, []).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                m.squeeze([1, 2]).numpy(), paddle.squeeze(m, [1, 2]).numpy()))
+        m = paddle.to_tensor([2, 3, 3, 1, 5, 3], 'float32')
+        self.assertTrue(
+            np.array_equal(m.unique()[0].numpy(), paddle.unique(m)[0].numpy()))
+        self.assertTrue(
+            np.array_equal(m.unique_with_counts()[2],
+                           paddle.unique_with_counts(m)[2]))
+        self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0])))
+        self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0)))
+        self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1)))
+        self.assertTrue(np.array_equal(x.cumsum(1), paddle.cumsum(x, 1)))
+        m = paddle.to_tensor(1)
+        self.assertTrue(np.array_equal(m.increment(), paddle.increment(m)))
+        m = x.abs()
+        self.assertTrue(np.array_equal(m.log(), paddle.log(m)))
+        self.assertTrue(np.array_equal(x.pow(2), paddle.pow(x, 2)))
+        self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x)))
+
+        # 2. Binary operation
+        self.assertTrue(
+            np.array_equal(
+                x.matmul(y, True, False).numpy(),
+                paddle.matmul(x, y, True, False).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.norm(
+                    p='fro', axis=[0, 1]).numpy(),
+                paddle.norm(
+                    x, p='fro', axis=[0, 1]).numpy()))
+        self.assertTrue(
+            np.array_equal(x.dist(y).numpy(), paddle.dist(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(x.cross(y).numpy(), paddle.cross(x, y).numpy()))
+        m = x.expand([2, 2, 3])
+        n = y.expand([2, 2, 3]).transpose([0, 2, 1])
+        self.assertTrue(
+            np.array_equal(m.bmm(n).numpy(), paddle.bmm(m, n).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.histogram(5, -1, 1).numpy(),
+                paddle.histogram(x, 5, -1, 1).numpy()))
+        self.assertTrue(
+            np.array_equal(x.equal(y).numpy(), paddle.equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.greater_equal(y).numpy(), paddle.greater_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.greater_than(y).numpy(), paddle.greater_than(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.less_equal(y).numpy(), paddle.less_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.less_than(y).numpy(), paddle.less_than(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.not_equal(y).numpy(), paddle.not_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.equal_all(y).numpy(), paddle.equal_all(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.allclose(y).numpy(), paddle.allclose(x, y).numpy()))
+        m = x.expand([2, 2, 3])
+        self.assertTrue(
+            np.array_equal(
+                x.expand_as(m).numpy(), paddle.expand_as(x, m).numpy()))
+        index = paddle.to_tensor([2, 1, 0])
+        self.assertTrue(
+            np.array_equal(
+                a.scatter(index, b).numpy(),
+                paddle.scatter(a, index, b).numpy()))
+
+        # 3. Bool tensor operation
+        x = paddle.to_tensor([[True, False], [True, False]])
+        y = paddle.to_tensor([[False, False], [False, True]])
+        self.assertTrue(
+            np.array_equal(x.reduce_all().numpy(), paddle.reduce_all(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.reduce_any().numpy(), paddle.reduce_any(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_not(y).numpy(), paddle.logical_not(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_or(y).numpy(), paddle.logical_or(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_xor(y).numpy(), paddle.logical_xor(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..884139a23d51c95c79439b91d501dc935baeae36
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -0,0 +1,336 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, ))
+        elif X.ndim == 2:
+            X = X.T
+        else:
+            dim = [i for i in range(len(X.shape))]
+            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
+            X = np.transpose(X, tuple(dim))
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((Y.size, ))
+        else:
+            dim = [i for i in range(len(Y.shape))]
+            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
+            Y = np.transpose(Y, tuple(dim))
+
+    Out = np.matmul(X, Y)
+    if not Out.shape:
+        # We do not support 0-dimensional Tensors (scalars). So where
+        # np.matmul outputs a scalar, we must convert to a Tensor of
+        # shape (1, ) instead.
+        # Everywhere else, we are compatible with np.matmul.
+        Out = np.array([Out], dtype="float64")
+    return Out
+
+
+class TestMatMulV2Op(OpTest):
+    """
+    case 1
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.config()
+        self.op_type = "matmul_v2"
+        x = np.random.random(self.x_shape).astype(self.dtype)
+        y = np.random.random(self.y_shape).astype(self.dtype)
+        result = reference_matmul(x, y, self.trans_x, self.trans_y)
+
+        self.inputs = {
+            'X': x,
+            'Y': y,
+        }
+        self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y}
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+class TestMatMuklOp2(TestMatMulV2Op):
+    """
+    case 2
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 3, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+        self.dtype = "float64"
+
+
+class TestMatMuklOp3(TestMatMulV2Op):
+    """
+    case 3
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp4(TestMatMulV2Op):
+    """
+    case 4
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp5(TestMatMulV2Op):
+    """
+    case 5
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 100, 2)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp6(TestMatMulV2Op):
+    """
+    case 6
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 100, 1)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp7(TestMatMulV2Op):
+    """
+    case 7
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 1, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp8(TestMatMulV2Op):
+    """
+    case 8
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp9(TestMatMulV2Op):
+    """
+    case 9
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 1, 100)
+        self.y_shape = (2, 1, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+        self.dtype = "float64"
+
+
+class TestMatMuklOp10(TestMatMulV2Op):
+    """
+    case 10
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp11(TestMatMulV2Op):
+    """
+    case 11
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp12(TestMatMulV2Op):
+    """
+    case 12
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 100, 2)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp13(TestMatMulV2Op):
+    """
+    case 13
+    """
+
+    def config(self):
+        self.x_shape = (2, 2, 100, 2)
+        self.y_shape = (2, 2, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp14(TestMatMulV2Op):
+    """
+    case 14_1
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 1, 100, 2)
+        self.y_shape = (1, 2, 2, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp15(TestMatMulV2Op):
+    """
+    case 14_2
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 1, 2, 100)
+        self.y_shape = (1, 2, 2, 100, 1)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp16(TestMatMulV2Op):
+    """
+    case 16 : to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (100)
+        self.y_shape = (1, 2, 2, 100, 1)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp17(TestMatMulV2Op):
+    """
+    case 17 : to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 100)
+        self.y_shape = (100)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMulV2API(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32")
+            input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32")
+
+            result = paddle.matmul(input_x, input_y)
+
+            x_np = np.random.random([4, 3]).astype("float32")
+            y_np = np.random.random([3, 4]).astype("float32")
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input_x": x_np,
+                                    "input_y": y_np},
+                              fetch_list=[result])
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_x = np.random.random([4, 3]).astype("float64")
+                input_y = np.random.random([3, 4]).astype("float64")
+                x = paddle.to_tensor(input_x)
+                y = paddle.to_tensor(input_y)
+                result = paddle.matmul(x, y)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9afc4bec66f2927a674ac15e807fe01f724c64f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_max_op.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMaxTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+    def test_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="float32")
+            result_max = paddle.max(x=data, axis=1)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.rand(10, 10).astype(np.float32)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_max = paddle.max(x=data, axis=0)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=0)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_max = paddle.max(x=data, axis=(0, 1))
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=(0, 1))).all(), True)
+
+    def test_errors(self):
+        paddle.enable_static()
+
+        def test_input_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = np.random.rand(10, 10)
+                result_max = paddle.max(x=data, axis=0)
+
+        self.assertRaises(TypeError, test_input_type)
+
+        def test_axis_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+                axis = paddle.static.data("axis", shape=[10, 10], dtype="int64")
+                result_min = paddle.min(data, axis)
+
+        self.assertRaises(TypeError, test_axis_type)
+
+    def test_imperative_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(np_x)
+        z = paddle.max(x, axis=0)
+        np_z = z.numpy()
+        z_expected = np.array(np.max(np_x, axis=0))
+        self.assertEqual((np_z == z_expected).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5645597007a00cac9c75ec1ae90bc00a5bc75f22
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMaximumTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+        self.input_x = np.random.rand(10, 15).astype("float32")
+        self.input_y = np.random.rand(10, 15).astype("float32")
+        self.input_z = np.random.rand(15).astype("float32")
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
+            result_max = paddle.maximum(data_x, data_y)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "y": self.input_y},
+                           fetch_list=[result_max])
+        self.assertEqual((res == np.maximum(self.input_x, self.input_y)).all(),
+                         True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_z = paddle.static.data("z", shape=[15], dtype="float32")
+            result_max = paddle.maximum(data_x, data_z, axis=1)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "z": self.input_z},
+                           fetch_list=[result_max])
+        self.assertEqual((res == np.maximum(self.input_x, self.input_z)).all(),
+                         True)
+
+    def test_dynamic_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        z = paddle.maximum(x, y)
+        np_z = z.numpy()
+        z_expected = np.array(np.maximum(self.input_x, self.input_y))
+        self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_broadcast_axis(self):
+        paddle.disable_static()
+        np_x = np.random.rand(5, 4, 3, 2).astype("float64")
+        np_y = np.random.rand(4, 3).astype("float64")
+
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        result_1 = paddle.maximum(x, y, axis=1)
+        result_2 = paddle.maximum(x, y, axis=-2)
+        self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index f3abd1acce6fafb8d187bfbe82765f982acae010..29e79b096cf790858e8e07aedc5c6b76881e8f82 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -17,10 +17,13 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
+np.random.seed(10)
+
 
 class TestMeanOp(OpTest):
     def setUp(self):
@@ -73,5 +76,182 @@ class TestFP16MeanOp(TestMeanOp):
                 place, ['X'], 'Out', max_relative_error=0.8)
 
 
+def ref_reduce_mean(x, axis=None, keepdim=False, reduce_all=False):
+    if isinstance(axis, list):
+        axis = tuple(axis)
+    if reduce_all:
+        axis = None
+    return np.mean(x, axis=axis, keepdims=keepdim)
+
+
+class TestReduceMeanOp(OpTest):
+    def setUp(self):
+        self.op_type = 'reduce_mean'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+        self.axis = [0]
+        self.keepdim = False
+        self.reduce_all = False
+        self.set_attrs()
+
+        np.random.seed(10)
+        x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out_np = ref_reduce_mean(x_np, self.axis, self.keepdim, self.reduce_all)
+        self.inputs = {'X': x_np}
+        self.outputs = {'Out': out_np}
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keepdim,
+            'reduce_all': self.reduce_all
+        }
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'])
+
+
+class TestReduceMeanOpDefaultAttrs(TestReduceMeanOp):
+    def setUp(self):
+        self.op_type = 'reduce_mean'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+
+        x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out_np = np.mean(x_np, axis=0)
+        self.inputs = {'X': x_np}
+        self.outputs = {'Out': out_np}
+
+
+class TestReduceMeanOpFloat32(TestReduceMeanOp):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestReduceMeanOpShape1D(TestReduceMeanOp):
+    def set_attrs(self):
+        self.shape = [100]
+
+
+class TestReduceMeanOpShape6D(TestReduceMeanOp):
+    def set_attrs(self):
+        self.shape = [2, 3, 4, 5, 6, 7]
+
+
+class TestReduceMeanOpAxisAll(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+
+
+class TestReduceMeanOpAxisTuple(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = (0, 1, 2)
+
+
+class TestReduceMeanOpAxisNegative(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [-2, -1]
+
+
+class TestReduceMeanOpKeepdimTrue1(TestReduceMeanOp):
+    def set_attrs(self):
+        self.keepdim = True
+
+
+class TestReduceMeanOpKeepdimTrue2(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+        self.keepdim = True
+
+
+class TestReduceMeanOpReduceAllTrue(TestReduceMeanOp):
+    def set_attrs(self):
+        self.reduce_all = True
+
+
+class TestMeanAPI(unittest.TestCase):
+    # test paddle.tensor.stat.mean
+
+    def setUp(self):
+        self.x_shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_api_static(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_shape)
+            out1 = paddle.mean(x)
+            out2 = paddle.tensor.mean(x)
+            out3 = paddle.tensor.stat.mean(x)
+            axis = np.arange(len(self.x_shape)).tolist()
+            out4 = paddle.mean(x, axis)
+            out5 = paddle.mean(x, tuple(axis))
+
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x},
+                          fetch_list=[out1, out2, out3, out4, out5])
+        out_ref = np.mean(self.x)
+        for out in res:
+            self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True)
+
+    def test_api_dygraph(self):
+        paddle.disable_static(self.place)
+
+        def test_case(x, axis=None, keepdim=False):
+            x_tensor = paddle.to_variable(x)
+            out = paddle.mean(x_tensor, axis, keepdim)
+            if isinstance(axis, list):
+                axis = tuple(axis)
+                if len(axis) == 0:
+                    axis = None
+            out_ref = np.mean(x, axis, keepdims=keepdim)
+            self.assertEqual(
+                np.allclose(
+                    out.numpy(), out_ref, rtol=1e-04), True)
+
+        test_case(self.x)
+        test_case(self.x, [])
+        test_case(self.x, -1)
+        test_case(self.x, keepdim=True)
+        test_case(self.x, 2, keepdim=True)
+        test_case(self.x, [0, 2])
+        test_case(self.x, (0, 2))
+        test_case(self.x, [0, 1, 2, 3])
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data("x", shape=[10, 10], dtype="float32")
+            out = fluid.layers.reduce_mean(input=x, dim=1)
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            x_np = np.random.rand(10, 10).astype(np.float32)
+            res = exe.run(feed={"x": x_np}, fetch_list=[out])
+        self.assertEqual(np.allclose(res[0], np.mean(x_np, axis=1)), True)
+
+        with fluid.dygraph.guard():
+            x_np = np.random.rand(10, 10).astype(np.float32)
+            x = fluid.dygraph.to_variable(x_np)
+            out = fluid.layers.reduce_mean(input=x, dim=1)
+        self.assertEqual(np.allclose(out.numpy(), np.mean(x_np, axis=1)), True)
+
+    def test_errors(self):
+        paddle.disable_static()
+        x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        x = paddle.to_tensor(x)
+        self.assertRaises(Exception, paddle.mean, x, -3)
+        self.assertRaises(Exception, paddle.mean, x, 2)
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12], 'int32')
+            self.assertRaises(TypeError, paddle.mean, x)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py
deleted file mode 100644
index ec27884cae2b0462951f6597b1b83e58d1c8af5d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_metrics.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.fluid as fluid
-from paddle.fluid.framework import Program, program_guard
-
-
-class TestMetricsDetectionMap(unittest.TestCase):
-    def test_detection_map(self):
-        program = fluid.Program()
-        with program_guard(program):
-            detect_res = fluid.layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = fluid.layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            box = fluid.layers.data(
-                name='bbox',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            map_eval = fluid.metrics.DetectionMAP(
-                detect_res, label, box, class_num=21)
-            cur_map, accm_map = map_eval.get_map_var()
-            self.assertIsNotNone(cur_map)
-            self.assertIsNotNone(accm_map)
-        print(str(program))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9eff05c5ea9fb585421b6f99bf55b3bb95bf9ff
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_min_op.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMinTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+    def test_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="float32")
+            result_min = paddle.min(x=data, axis=1)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.rand(10, 10).astype(np.float32)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_min = paddle.min(x=data, axis=0)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=0)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_min = paddle.min(x=data, axis=(0, 1))
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=(0, 1))).all(), True)
+
+    def test_errors(self):
+        paddle.enable_static()
+
+        def test_input_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = np.random.rand(10, 10)
+                result_min = paddle.min(x=data, axis=0)
+
+        self.assertRaises(TypeError, test_input_type)
+
+        def test_axis_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+                axis = paddle.static.data("axis", shape=[10, 10], dtype="int64")
+                result_min = paddle.min(data, axis)
+
+        self.assertRaises(TypeError, test_axis_type)
+
+    def test_imperative_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(np_x)
+        z = paddle.min(x, axis=0)
+        np_z = z.numpy()
+        z_expected = np.array(np.min(np_x, axis=0))
+        self.assertEqual((np_z == z_expected).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c08b7386ca2c5da04c0a289872dacf68a2ea040
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMinimumTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+        self.input_x = np.random.rand(10, 15).astype("float32")
+        self.input_y = np.random.rand(10, 15).astype("float32")
+        self.input_z = np.random.rand(15).astype("float32")
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
+            result_min = paddle.minimum(data_x, data_y)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "y": self.input_y},
+                           fetch_list=[result_min])
+        self.assertEqual((res == np.minimum(self.input_x, self.input_y)).all(),
+                         True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_z = paddle.static.data("z", shape=[15], dtype="float32")
+            result_min = paddle.minimum(data_x, data_z, axis=1)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "z": self.input_z},
+                           fetch_list=[result_min])
+        self.assertEqual((res == np.minimum(self.input_x, self.input_z)).all(),
+                         True)
+
+    def test_dynamic_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        z = paddle.minimum(x, y)
+        np_z = z.numpy()
+        z_expected = np.array(np.minimum(self.input_x, self.input_y))
+        self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_broadcast_axis(self):
+        paddle.disable_static()
+        np_x = np.random.rand(5, 4, 3, 2).astype("float64")
+        np_y = np.random.rand(4, 3).astype("float64")
+
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        result_1 = paddle.minimum(x, y, axis=1)
+        result_2 = paddle.minimum(x, y, axis=-2)
+        self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index 2d4c8f61c0406d2da8a3ae54f197dfcca26a3f12..f6207edb41c190ac51dfe67dad22bb0191a67a07 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -52,7 +52,7 @@ class TestDatasetWithStat(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.fleet.DatasetFactory().create_dataset(
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
             "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
index 89052396cf94615aab0841090430509c38b8423f..753d96c44114a552f4bdd299602d7f13f672efbf 100644
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
@@ -69,6 +69,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -106,6 +107,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -143,6 +145,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -177,5 +180,112 @@ class TestNNMseLoss(unittest.TestCase):
             self.assertTrue(dy_result.shape, [1])
 
 
+class TestNNFunctionalMseLoss(unittest.TestCase):
+    def test_NNFunctionalMseLoss_mean(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean')
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+            static_result = exe.run(
+                prog,
+                feed={"input": input_np,
+                      "target": target_np},
+                fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'mean')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = np.mean(sub * sub)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+    def test_NNFunctionalMseLoss_sum(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum')
+
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                static_result = exe.run(
+                    prog,
+                    feed={"input": input_np,
+                          "target": target_np},
+                    fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'sum')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = np.sum(sub * sub)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+    def test_NNFunctionalMseLoss_none(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'none')
+
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                static_result = exe.run(
+                    prog,
+                    feed={"input": input_np,
+                          "target": target_np},
+                    fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'none')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = sub * sub
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 8ca06aa952184daec6be59a09330c8f16f6ee1d6..5f223de1954f7b401ac031265cca8c2e661c7392 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2):
                 no_grad_set=set('Y'))
 
 
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp1(TestMulOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=1e-1)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ingore_x(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp2(TestMulOp2):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=2e-1)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.9)
+
+    def test_check_grad_ingore_x(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py
old mode 100644
new mode 100755
index 64421f6a1c6a018fdf82a7518f647099830972b3..dbf167617a24f36a36aff52d996a50ca3ebb6672
--- a/python/paddle/fluid/tests/unittests/test_multiply.py
+++ b/python/paddle/fluid/tests/unittests/test_multiply.py
@@ -26,8 +26,10 @@ class TestMultiplyAPI(unittest.TestCase):
 
     def __run_static_graph_case(self, x_data, y_data, axis=-1):
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = paddle.nn.data(name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = paddle.static.data(
+                name='x', shape=x_data.shape, dtype=x_data.dtype)
+            y = paddle.static.data(
+                name='y', shape=y_data.shape, dtype=y_data.dtype)
             res = tensor.multiply(x, y, axis=axis)
 
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -41,9 +43,9 @@ class TestMultiplyAPI(unittest.TestCase):
             return res
 
     def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
-        paddle.enable_imperative()
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        paddle.disable_static()
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         res = paddle.multiply(x, y, axis=axis)
         return res.numpy()
 
@@ -107,34 +109,48 @@ class TestMultiplyError(unittest.TestCase):
     def test_errors(self):
         """test_errors."""
         # test static computation graph: dtype can not be int8
-        paddle.disable_imperative()
+        paddle.enable_static()
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=[100], dtype=np.int8)
-            y = paddle.nn.data(name='y', shape=[100], dtype=np.int8)
+            x = paddle.static.data(name='x', shape=[100], dtype=np.int8)
+            y = paddle.static.data(name='y', shape=[100], dtype=np.int8)
             self.assertRaises(TypeError, tensor.multiply, x, y)
 
         # test static computation graph: inputs must be broadcastable 
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=[20, 50], dtype=np.float64)
-            y = paddle.nn.data(name='y', shape=[20], dtype=np.float64)
+            x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64)
+            y = paddle.static.data(name='y', shape=[20], dtype=np.float64)
             self.assertRaises(fluid.core.EnforceNotMet, tensor.multiply, x, y)
 
         np.random.seed(7)
         # test dynamic computation graph: dtype can not be int8
-        paddle.enable_imperative()
+        paddle.disable_static()
         x_data = np.random.randn(200).astype(np.int8)
         y_data = np.random.randn(200).astype(np.int8)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
 
         # test dynamic computation graph: inputs must be broadcastable
         x_data = np.random.rand(200, 5)
         y_data = np.random.rand(200)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
 
+        # test dynamic computation graph: inputs must be broadcastable(python)
+        x_data = np.random.rand(200, 5)
+        y_data = np.random.rand(200)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
+
+        # test dynamic computation graph: dtype must be same
+        x_data = np.random.randn(200).astype(np.int64)
+        y_data = np.random.randn(200).astype(np.float64)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        self.assertRaises(TypeError, paddle.multiply, x, y)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e2f9562b453b7faf40d4fc421dcea4967724025
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.io import TensorDataset, DataLoader
+from paddle.fluid.dygraph.base import to_variable
+
+
+class TestTensorDataset(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        fluid.default_startup_program().random_seed = 1
+        fluid.default_main_program().random_seed = 1
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([16, 3, 4]).astype('float32')
+            input = to_variable(input_np)
+            label_np = np.random.random([16, 1]).astype('int32')
+            label = to_variable(label_np)
+
+            dataset = TensorDataset([input, label])
+            assert len(dataset) == 16
+            dataloader = DataLoader(
+                dataset,
+                places=place,
+                num_workers=num_workers,
+                batch_size=1,
+                drop_last=True)
+
+            for i, (input, label) in enumerate(dataloader()):
+                assert len(input) == 1
+                assert len(label) == 1
+                assert input.shape == [1, 3, 4]
+                assert label.shape == [1, 1]
+                assert isinstance(input, paddle.Tensor)
+                assert isinstance(label, paddle.Tensor)
+                assert np.allclose(input.numpy(), input_np[i])
+                assert np.allclose(label.numpy(), label_np[i])
+
+    def test_main(self):
+        for p in [fluid.CPUPlace(), fluid.CUDAPlace(0)]:
+            for num_workers in [0, 2]:
+                ret = self.run_main(num_workers=num_workers, places=p)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index f3b15835b9e6f2797a2c76758d0b42db3d50ff27..3a8867f6bd29f5bc0e512f9c8b22ecf192253fc7 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -24,7 +24,7 @@ import numpy as np
 
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.io import Dataset, BatchSampler, DataLoader
+from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
 
@@ -108,6 +108,48 @@ class TestDataLoaderAssert(unittest.TestCase):
                 self.assertTrue(False)
 
 
+class TestDatasetRuntimeError(unittest.TestCase):
+    def test_main(self):
+        dataset = Dataset()
+
+        # __getitem__ not implement
+        try:
+            d = dataset[0]
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        # __len__ not implement
+        try:
+            l = len(dataset)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        dataset = IterableDataset()
+
+        # __iter__ not implement
+        try:
+            d = iter(dataset)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        # __getitem__ runtime error
+        try:
+            d = dataset[0]
+            self.assertTrue(False)
+        except RuntimeError:
+            pass
+
+        # __len__ runtime error
+        try:
+            l = len(dataset)
+            self.assertTrue(False)
+        except RuntimeError:
+            pass
+
+
 # CI Converage cannot record stub in subprocess,
 # HACK a _worker_loop in main process call here
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -144,12 +186,15 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                     indices_queue.put([i, i + 10])
                 indices_queue.put(None)
                 loader._worker_loop(
-                    loader._dataset, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0)
+                    loader._dataset, 0, indices_queue, loader._data_queue,
+                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
                 self.assertTrue(False)
         except AssertionError:
             pass
-        except Exception:
+        except Exception as e:
+            print("Exception", e)
+            import sys
+            sys.stdout.flush()
             self.assertTrue(False)
 
     def run_with_worker_done(self, use_shared_memory=True):
@@ -184,8 +229,8 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                 indices_queue.put(None)
                 loader._workers_done_event.set()
                 loader._worker_loop(
-                    loader._dataset, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0)
+                    loader._dataset, 0, indices_queue, loader._data_queue,
+                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
                 self.assertTrue(True)
         except AssertionError:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f0209406fdff1d4f7659b15d5e6bd8af74fd0f3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import os
+import sys
+import six
+import time
+import unittest
+import multiprocessing
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import Dataset, BatchSampler, DataLoader
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.dygraph.base import to_variable
+
+from test_multiprocess_dataloader_iterable_dataset_static import RandomDataset, prepare_places
+from test_multiprocess_dataloader_iterable_dataset_static import EPOCH_NUM, BATCH_SIZE, IMAGE_SIZE, SAMPLE_NUM, CLASS_NUM
+
+
+class SimpleFCNet(fluid.dygraph.Layer):
+    def __init__(self):
+        super(SimpleFCNet, self).__init__()
+
+        param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.8))
+        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5))
+        self._fcs = []
+        in_channel = IMAGE_SIZE
+        for hidden_size in [10, 20, 30]:
+            self._fcs.append(
+                Linear(
+                    in_channel,
+                    hidden_size,
+                    act='tanh',
+                    param_attr=param_attr,
+                    bias_attr=bias_attr))
+            in_channel = hidden_size
+        self._fcs.append(
+            Linear(
+                in_channel,
+                CLASS_NUM,
+                act='softmax',
+                param_attr=param_attr,
+                bias_attr=bias_attr))
+
+    def forward(self, image):
+        out = image
+        for fc in self._fcs:
+            out = fc(out)
+        return out
+
+
+class TestDygraphDataLoader(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        fluid.default_startup_program().random_seed = 1
+        fluid.default_main_program().random_seed = 1
+        with fluid.dygraph.guard(places[0]):
+            fc_net = SimpleFCNet()
+            optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
+
+            dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
+            dataloader = DataLoader(
+                dataset,
+                places=places,
+                num_workers=num_workers,
+                batch_size=BATCH_SIZE,
+                drop_last=True)
+
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            for _ in six.moves.range(EPOCH_NUM):
+                step = 0
+                for image, label in dataloader():
+                    out = fc_net(image)
+                    loss = fluid.layers.cross_entropy(out, label)
+                    avg_loss = fluid.layers.reduce_mean(loss)
+                    avg_loss.backward()
+                    optimizer.minimize(avg_loss)
+                    fc_net.clear_gradients()
+
+                    loss_list.append(np.mean(avg_loss.numpy()))
+                    step += 1
+                step_list.append(step)
+
+        end_t = time.time()
+        ret = {
+            "time": end_t - start_t,
+            "step": step_list,
+            "loss": np.array(loss_list)
+        }
+        print("time cost", ret['time'], 'step_list', ret['step'])
+        return ret
+
+    def test_main(self):
+        # dynamic graph do not run with_data_parallel
+        for p in prepare_places(False):
+            results = []
+            for num_workers in [0, 2]:
+                print(self.__class__.__name__, p, num_workers)
+                sys.stdout.flush()
+                ret = self.run_main(num_workers=num_workers, places=p)
+                results.append(ret)
+            assert results[0]['loss'].shape[0] * 2 == results[1]['loss'].shape[
+                0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
new file mode 100644
index 0000000000000000000000000000000000000000..562051335850a5b665580981d2c41a20c8fe7575
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import math
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import IterableDataset, BatchSampler, DataLoader, get_worker_info
+
+
+class RangeIterableDatasetSplit(IterableDataset):
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __iter__(self):
+        worker_info = get_worker_info()
+        if worker_info is None:
+            iter_start = self.start
+            iter_end = self.end
+        else:
+            per_worker = int(
+                math.ceil((self.end - self.start) / float(
+                    worker_info.num_workers)))
+            worker_id = worker_info.id
+            iter_start = self.start + worker_id * per_worker
+            iter_end = min(iter_start + per_worker, self.end)
+
+        for i in range(iter_start, iter_end):
+            yield np.array([i])
+
+
+class TestDynamicDataLoaderIterSplit(unittest.TestCase):
+    def test_main(self):
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            dataset = RangeIterableDatasetSplit(0, 10)
+            dataloader = DataLoader(
+                dataset,
+                places=place,
+                num_workers=2,
+                batch_size=1,
+                drop_last=True)
+
+            rets = []
+            for d in dataloader:
+                rets.append(d[0].numpy()[0][0])
+
+            assert tuple(sorted(rets)) == tuple(range(0, 10))
+
+
+class RangeIterableDataset(IterableDataset):
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __iter__(self):
+        for i in range(self.start, self.end):
+            yield np.array([i])
+
+
+class TestDynamicDataLoaderIterInitFuncSplit(unittest.TestCase):
+    def test_main(self):
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            dataset = RangeIterableDataset(0, 10)
+
+            def worker_spliter(worker_id):
+                worker_info = get_worker_info()
+
+                dataset = worker_info.dataset
+                start = dataset.start
+                end = dataset.end
+                num_per_worker = int(
+                    math.ceil((end - start) / float(worker_info.num_workers)))
+
+                worker_id = worker_info.id
+                dataset.start = start + worker_id * num_per_worker
+                dataset.end = min(dataset.start + num_per_worker, end)
+
+            dataloader = DataLoader(
+                dataset,
+                places=place,
+                num_workers=1,
+                batch_size=1,
+                drop_last=True,
+                worker_init_fn=worker_spliter)
+
+            rets = []
+            for d in dataloader:
+                rets.append(d[0].numpy()[0][0])
+
+            assert tuple(sorted(rets)) == tuple(range(0, 10))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64e11d156ec74a375c161926ce3671e83f2352a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import os
+import sys
+import six
+import time
+import unittest
+import multiprocessing
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import IterableDataset, BatchSampler, DataLoader, get_worker_info
+
+EPOCH_NUM = 2
+BATCH_SIZE = 8
+IMAGE_SIZE = 32
+SAMPLE_NUM = 80
+CLASS_NUM = 10
+
+
+class RandomDataset(IterableDataset):
+    def __init__(self, sample_num, class_num):
+        self.sample_num = sample_num
+        self.class_num = class_num
+
+    def __iter__(self):
+        for i in range(self.sample_num):
+            np.random.seed(i)
+            image = np.random.random([IMAGE_SIZE]).astype('float32')
+            label = np.random.randint(0, self.class_num - 1,
+                                      (1, )).astype('int64')
+            yield image, label
+
+
+def simple_fc_net_static():
+    startup_prog = fluid.Program()
+    main_prog = fluid.Program()
+    startup_prog.random_seed = 1
+    main_prog.random_seed = 1
+
+    with fluid.unique_name.guard():
+        with fluid.program_guard(main_prog, startup_prog):
+            image = fluid.data(
+                name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            hidden = image
+            param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.8))
+            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.5))
+            for hidden_size in [10, 20, 30]:
+                hidden = fluid.layers.fc(hidden,
+                                         size=hidden_size,
+                                         act='tanh',
+                                         param_attr=param_attr,
+                                         bias_attr=bias_attr)
+
+            predict_label = fluid.layers.fc(hidden,
+                                            size=CLASS_NUM,
+                                            act='softmax',
+                                            param_attr=param_attr,
+                                            bias_attr=bias_attr)
+            loss = fluid.layers.reduce_mean(
+                fluid.layers.cross_entropy(
+                    input=predict_label, label=label))
+
+            optimizer = fluid.optimizer.Adam()
+            optimizer.minimize(loss)
+    return startup_prog, main_prog, image, label, loss
+
+
+def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
+    places = []
+    if with_cpu:
+        places.append([fluid.CPUPlace()])
+        if with_data_parallel:
+            places.append([fluid.CPUPlace()] * 2)
+
+    if with_gpu and fluid.core.is_compiled_with_cuda():
+        tmp = fluid.cuda_places()[:2]
+        assert len(tmp) > 0, "no gpu detected"
+        if with_data_parallel:
+            places.append(tmp)
+        places.append([tmp[0]])
+    return places
+
+
+class TestStaticDataLoader(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            startup_prog, main_prog, image, label, loss = simple_fc_net_static()
+
+            dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
+            dataloader = DataLoader(
+                dataset,
+                feed_list=[image, label],
+                places=places,
+                num_workers=num_workers,
+                batch_size=BATCH_SIZE,
+                drop_last=True)
+            # assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
+
+            exe = fluid.Executor(place=places[0])
+            exe.run(startup_prog)
+
+            prog = fluid.CompiledProgram(main_prog)
+            if len(places) > 1:
+                prog = prog.with_data_parallel(
+                    loss_name=loss.name, places=places)
+
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            for i in six.moves.range(EPOCH_NUM):
+                step = 0
+                for d in dataloader:
+                    assert len(d) == len(places), "{} != {}".format(
+                        len(d), len(places))
+                    for i, item in enumerate(d):
+                        image = item['image']
+                        label = item['label']
+                        assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
+                        assert label.shape() == [BATCH_SIZE, 1]
+                        assert image._place()._equals(places[i])
+                        assert label._place()._equals(places[i])
+                    L, = exe.run(program=prog,
+                                 feed=d,
+                                 fetch_list=[loss],
+                                 use_program_cache=True)
+                    loss_list.append(np.mean(L))
+                    step += 1
+                step_list.append(step)
+
+        end_t = time.time()
+        ret = {
+            "time": end_t - start_t,
+            "step": step_list,
+            "loss": np.array(loss_list)
+        }
+        print("time cost", ret['time'], 'step_list', ret['step'])
+        return ret
+
+    def test_main(self):
+        for p in prepare_places(True):
+            results = []
+            for num_workers in [0, 2]:
+                print(self.__class__.__name__, p, num_workers)
+                sys.stdout.flush()
+                ret = self.run_main(num_workers=num_workers, places=p)
+                results.append(ret)
+            assert results[0]['loss'].shape[0] * 2 == results[1]['loss'].shape[
+                0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
index e5f44403a91f5167996359a233aee37bf622db9d..38497f91fc18847e40efa691a65c2a7adc20e51c 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
@@ -137,14 +137,8 @@ class TestStaticDataLoader(unittest.TestCase):
                         label = item['label']
                         assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
                         assert label.shape() == [BATCH_SIZE, 1]
-                        if places[i]._equals(fluid.CPUPlace()):
-                            assert image._place()._equals(fluid.CPUPlace())
-                            assert label._place()._equals(fluid.CPUPlace())
-                        else:
-                            assert image._place()._equals(fluid.CUDAPinnedPlace(
-                            ))
-                            assert label._place()._equals(fluid.CUDAPinnedPlace(
-                            ))
+                        assert image._place()._equals(places[i])
+                        assert label._place()._equals(places[i])
                     L, = exe.run(program=prog,
                                  feed=d,
                                  fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..19da09a463f3cc6224a22eb90278abae9ec59b91
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
@@ -0,0 +1,556 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle.nn as nn
+import paddle
+
+
+def nearest_neighbor_interp_np(X,
+                               out_h,
+                               out_w,
+                               out_size=None,
+                               actual_shape=None,
+                               align_corners=True,
+                               data_layout='NCHW'):
+    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
+    if data_layout == "NHWC":
+        X = np.transpose(X, (0, 3, 1, 2))  # NHWC => NCHW
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    n, c, in_h, in_w = X.shape
+
+    ratio_h = ratio_w = 0.0
+    if (out_h > 1):
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if (out_w > 1):
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((n, c, out_h, out_w))
+
+    if align_corners:
+        for i in range(out_h):
+            in_i = int(ratio_h * i + 0.5)
+            for j in range(out_w):
+                in_j = int(ratio_w * j + 0.5)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
+    else:
+        for i in range(out_h):
+            in_i = int(ratio_h * i)
+            for j in range(out_w):
+                in_j = int(ratio_w * j)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
+
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+
+    return out.astype(X.dtype)
+
+
+class TestNearestInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(
+            input_np, out_h, out_w, self.out_size, self.actual_shape,
+            self.align_corners, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 4, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpSame(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 4, 4, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 8]).astype("int32")
+        self.align_corners = True
+        self.data_layout = "NHWC"
+
+
+class TestNearestInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 80
+        self.out_w = 40
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+    def set_align_corners(self):
+        self.align_corners = False
+
+
+class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 7, 5]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 5, 7]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 1.5
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 7, 5]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = [2.0, 3.0]
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestInterpOp_attr_tensor(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float64")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:
+            self.inputs['Scale'] = np.array([self.scale]).astype("float64")
+        elif self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 5, 4, 4]
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [3, 3]
+        self.align_corners = True
+
+
+# out_size is a tensor list
+class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# out_size is a 1-D tensor
+class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+class TestNearestAPI(unittest.TestCase):
+    def test_case(self):
+        x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+        y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_nearest(
+            y, out_shape=[12, 12], data_format='NHWC')
+        out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim])
+        out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor)
+        out4 = fluid.layers.resize_nearest(
+            x, out_shape=[4, 4], actual_shape=actual_size)
+        out5 = fluid.layers.resize_nearest(x, scale=scale_tensor)
+
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        dim_data = np.array([12]).astype("int32")
+        shape_data = np.array([12, 12]).astype("int32")
+        actual_size_data = np.array([12, 12]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x": x_data,
+                              "y": np.transpose(x_data, (0, 2, 3, 1)),
+                              "dim": dim_data,
+                              "shape_tensor": shape_data,
+                              "actual_size": actual_size_data,
+                              "scale_tensor": scale_data
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5],
+                          return_numpy=True)
+
+        expect_res = nearest_neighbor_interp_np(
+            x_data, out_h=12, out_w=12, align_corners=True)
+        self.assertTrue(
+            np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
+        for i in range(len(results) - 1):
+            self.assertTrue(np.allclose(results[i + 1], expect_res))
+
+
+class TestUpsampleNearest2dInterpOpAPI2_0(unittest.TestCase):
+    def test_case(self):
+
+        # dygraph
+        x_data = np.random.random((1, 3, 6, 6)).astype("float32")
+        upsample = paddle.nn.UpsamplingNearest2d(scale_factor=[2, 2])
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = upsample(x)
+            expect = nearest_neighbor_interp_np(
+                x_data, out_h=12, out_w=12, align_corners=False)
+            self.assertTrue(np.allclose(interp.numpy(), expect))
+
+
+class TestNearestInterpException(unittest.TestCase):
+    def test_exception(self):
+        input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32")
+
+        def attr_data_format():
+            # for 4-D input, data_format can only be NCHW or NHWC
+            out = fluid.layers.resize_nearest(
+                input, out_shape=[4, 8], data_format='NDHWC')
+
+        def attr_scale_type():
+            out = fluid.layers.resize_nearest(input, scale='scale')
+
+        def attr_scale_value():
+            out = fluid.layers.resize_nearest(input, scale=-0.3)
+
+        self.assertRaises(ValueError, attr_data_format)
+        self.assertRaises(TypeError, attr_scale_type)
+        self.assertRaises(ValueError, attr_scale_value)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py
index b14e3a15d979c6f66c2ffeeeec6536d5a8ab3b47..e7154193beaf788a9d20f3c131b1df3420918266 100644
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
@@ -445,7 +445,6 @@ class TestNLLLoss(unittest.TestCase):
         startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(
                 name='input', shape=[5, 3, 5, 5], dtype='float64')
@@ -879,5 +878,93 @@ class TestNLLLossOp2DNoReduce(OpTest):
         self.label_shape = [5, 5, 5]
 
 
+class TestNLLLossName(unittest.TestCase):
+    def test_name(self):
+        prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        place = paddle.CPUPlace()
+        with paddle.static.program_guard(prog, startup_prog):
+            x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+            label = paddle.data(name='label', shape=[10], dtype='int64')
+            nll_loss = paddle.nn.loss.NLLLoss(name='nll_loss')
+            res = nll_loss(x, label)
+            self.assertTrue(res.name.startswith('nll_loss'))
+
+
+class TestNLLLossInvalidArgs(unittest.TestCase):
+    def test_x_dim_value_error(self):
+        def test_x_dim_lt_2():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, ], dtype='float64')
+                label = paddle.data(name='label', shape=[10, ], dtype='float64')
+                nll_loss = paddle.nn.loss.NLLLoss()
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_x_dim_lt_2)
+
+        def test_x_dim_imperative_lt_2():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, )).astype(np.float64)
+                label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                nll_loss = paddle.nn.loss.NLLLoss()
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_x_dim_imperative_lt_2)
+
+    def test_reduction_value_error(self):
+        def test_NLLLoss_reduction_not_sum_mean_none():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                label = paddle.data(name='label', shape=[10], dtype='int64')
+                nll_loss = paddle.nn.loss.NLLLoss(reduction='')
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_NLLLoss_reduction_not_sum_mean_none)
+
+        def test_NLLLoss_reduction_imperative_not_sum_mean_none():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, 3)).astype(np.float64)
+                label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                nll_loss = paddle.nn.loss.NLLLoss(reduction='')
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError,
+                          test_NLLLoss_reduction_imperative_not_sum_mean_none)
+
+        def test_nll_loss_function_reduction_not_sum_mean_none():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                label = paddle.data(name='label', shape=[10], dtype='int64')
+                res = paddle.nn.functional.nll_loss(x, label, reduction='')
+
+        self.assertRaises(ValueError,
+                          test_nll_loss_function_reduction_not_sum_mean_none)
+
+        def test_nll_loss_function_reduction_imperative_not_sum_mean_none():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, 3)).astype(np.float64)
+                label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                res = paddle.nn.functional.nll_loss(x, label, reduction='')
+
+        self.assertRaises(
+            ValueError,
+            test_nll_loss_function_reduction_imperative_not_sum_mean_none)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..339f689998f817054611bd85b11945b61d1f649b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.nn.functional as functional
+import paddle.fluid.framework as framework
+from paddle.fluid.framework import Program, program_guard
+
+
+class TestOneHotOp(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        depth_np = np.array(10).astype('int32')
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_attr(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
+                              depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, 0, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_default_dtype(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        depth_np = np.array(10).astype('int32')
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+        self.attrs = {}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_default_dtype_attr(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
+                              depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, 0, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_exception(unittest.TestCase):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        self.depth = 10
+        self.place = core.CPUPlace()
+        self.dimension = 12
+        self.x = core.LoDTensor()
+        x_lod = [[4, 1, 3, 3]]
+        data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
+        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
+        self.x.set(data, self.place)
+        self.x.set_recursive_sequence_lengths(x_lod)
+
+    def test_check_output(self):
+        program = Program()
+        with program_guard(program):
+            x = fluid.layers.data(
+                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            block = program.current_block()
+            one_hot_out = block.create_var(
+                name="one_hot_out",
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                dtype='float32')
+            block.append_op(
+                type='one_hot',
+                inputs={'X': x},
+                attrs={'depth': self.depth},
+                outputs={'Out': one_hot_out})
+            exe = fluid.Executor(self.place)
+
+            def run():
+                exe.run(feed={'x': self.x},
+                        fetch_list=[one_hot_out],
+                        return_numpy=False)
+
+            self.assertRaises(core.EnforceNotMet, run)
+
+
+class TestOneHotOpApi(unittest.TestCase):
+    def test_api(self):
+        num_classes = 10
+        self._run(num_classes)
+
+    def test_api_with_depthTensor(self):
+        num_classes = fluid.layers.assign(input=np.array([10], dtype=np.int32))
+        self._run(num_classes)
+
+    def test_api_with_dygraph(self):
+        num_classes = 10
+        label = np.array(
+            [np.random.randint(0, num_classes - 1)
+             for i in range(6)]).reshape([6, 1])
+        with fluid.dygraph.guard():
+            one_hot_label = functional.one_hot(
+                x=fluid.dygraph.to_variable(label), num_classes=num_classes)
+
+    def _run(self, num_classes):
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        one_hot_label = functional.one_hot(x=label, num_classes=num_classes)
+
+        place = fluid.CPUPlace()
+        label_data = np.array([np.random.randint(0, 10 - 1)
+                               for i in range(6)]).reshape([6, 1])
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        ret = exe.run(feed={'label': label_data, },
+                      fetch_list=[one_hot_label],
+                      return_numpy=False)
+
+
+class BadInputTestOnehotV2(unittest.TestCase):
+    def test_error(self):
+        with fluid.program_guard(fluid.Program()):
+
+            def test_bad_x():
+                label = fluid.layers.data(
+                    name="label",
+                    shape=[4],
+                    append_batch_size=False,
+                    dtype="float32")
+                one_hot_label = functional.one_hot(x=label, num_classes=4)
+
+            self.assertRaises(TypeError, test_bad_x)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebe769fb9bce1aee8412ccebc216c2c85e97775
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.static import Program, program_guard
+
+
+def calc_margin_rank_loss(x, y, label, margin=0.0, reduction='none'):
+    result = (-1 * label) * (x - y) + margin
+    result = np.maximum(result, 0)
+    if reduction == 'none':
+        return result
+    elif reduction == 'sum':
+        return np.sum(result)
+    elif reduction == 'mean':
+        return np.mean(result)
+
+
+def create_test_case(margin, reduction):
+    class MarginRankingLossCls(unittest.TestCase):
+        def setUp(self):
+            self.x_data = np.random.rand(10, 10).astype("float64")
+            self.y_data = np.random.rand(10, 10).astype("float64")
+            self.label_data = np.random.choice(
+                [-1, 1], size=[10, 10]).astype("float64")
+            self.places = []
+            self.places.append(fluid.CPUPlace())
+            if core.is_compiled_with_cuda():
+                self.places.append(paddle.CUDAPlace(0))
+
+        def run_static_functional_api(self, place):
+            paddle.enable_static()
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(
+                    name="x", shape=[10, 10], dtype="float64")
+                y = paddle.static.data(
+                    name="y", shape=[10, 10], dtype="float64")
+                label = paddle.static.data(
+                    name="label", shape=[10, 10], dtype="float64")
+                result = paddle.nn.functional.margin_ranking_loss(
+                    x, y, label, margin, reduction)
+                exe = paddle.static.Executor(place)
+                result_numpy, = exe.run(feed={
+                    "x": self.x_data,
+                    "y": self.y_data,
+                    "label": self.label_data
+                },
+                                        fetch_list=[result])
+                self.assertTrue(np.allclose(result_numpy, expected))
+
+        def run_static_api(self, place):
+            paddle.enable_static()
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(
+                    name="x", shape=[10, 10], dtype="float64")
+                y = paddle.static.data(
+                    name="y", shape=[10, 10], dtype="float64")
+                label = paddle.static.data(
+                    name="label", shape=[10, 10], dtype="float64")
+                margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                    margin=margin, reduction=reduction)
+                result = margin_rank_loss(x, y, label)
+                exe = paddle.static.Executor(place)
+                result_numpy, = exe.run(feed={
+                    "x": self.x_data,
+                    "y": self.y_data,
+                    "label": self.label_data
+                },
+                                        fetch_list=[result])
+                self.assertTrue(np.allclose(result_numpy, expected))
+                self.assertTrue('loss' in result.name)
+
+        def run_dynamic_functional_api(self, place):
+            paddle.disable_static(place)
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(self.label_data)
+
+            result = paddle.nn.functional.margin_ranking_loss(x, y, label,
+                                                              margin, reduction)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def run_dynamic_api(self, place):
+            paddle.disable_static(place)
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(self.label_data)
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=margin, reduction=reduction)
+            result = margin_rank_loss(x, y, label)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def run_dynamic_broadcast_api(self, place):
+            paddle.disable_static(place)
+            label_data = np.random.choice([-1, 1], size=[10]).astype("float64")
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(label_data)
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=margin, reduction=reduction)
+            result = margin_rank_loss(x, y, label)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def test_case(self):
+            for place in self.places:
+                self.run_static_api(place)
+                self.run_static_functional_api(place)
+                self.run_dynamic_api(place)
+                self.run_dynamic_functional_api(place)
+                self.run_dynamic_broadcast_api(place)
+
+    cls_name = "TestMarginRankLossCase_{}_{}".format(margin, reduction)
+    MarginRankingLossCls.__name__ = cls_name
+    globals()[cls_name] = MarginRankingLossCls
+
+
+for margin in [0.0, 0.2]:
+    for reduction in ['none', 'mean', 'sum']:
+        create_test_case(margin, reduction)
+
+
+# test case the raise message
+class MarginRakingLossError(unittest.TestCase):
+    paddle.enable_static()
+
+    def test_errors(self):
+        def test_margin_value_error():
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=0.1, reduction="reduce_mean")
+
+        self.assertRaises(ValueError, test_margin_value_error)
+
+        def test_functional_margin_value_error():
+            x = paddle.static.data(name="x", shape=[10, 10], dtype="float64")
+            y = paddle.static.data(name="y", shape=[10, 10], dtype="float64")
+            label = paddle.static.data(
+                name="label", shape=[10, 10], dtype="float64")
+            result = paddle.nn.functional.margin_ranking_loss(
+                x, y, label, margin=0.1, reduction="reduction_mean")
+
+        self.assertRaises(ValueError, test_functional_margin_value_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52a1f5d5b16ca7e0d58230a1a17624e5bff0b02
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit, erf
+import paddle
+import paddle.fluid as fluid
+import paddle.nn as nn
+import paddle.nn.functional as functional
+
+
+class TestNNSigmoidAPI(unittest.TestCase):
+    def setUp(self):
+        self.init_data()
+
+    def init_data(self):
+        self.x_shape = [10, 15]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.y = self.ref_forward(self.x)
+
+    def ref_forward(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def ref_backward(self, y, dy):
+        return dy * y * (1 - y)
+
+    def check_static_api(self, place):
+        paddle.enable_static()
+        main_program = paddle.static.Program()
+        mysigmoid = nn.Sigmoid(name="api_sigmoid")
+        with paddle.static.program_guard(main_program):
+            x = paddle.static.data(name='x', shape=self.x_shape)
+            x.stop_gradient = False
+            y = mysigmoid(x)
+            fluid.backward.append_backward(paddle.mean(y))
+        exe = paddle.static.Executor(place)
+        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.assertTrue(np.allclose(out[0], self.y))
+        self.assertTrue(y.name.startswith("api_sigmoid"))
+
+    def check_dynamic_api(self, place):
+        paddle.disable_static(place)
+        x = paddle.to_variable(self.x)
+        mysigmoid = nn.Sigmoid()
+        y = mysigmoid(x)
+        self.assertTrue(np.allclose(y.numpy(), self.y))
+
+    def test_check_api(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_dynamic_api(place)
+            self.check_static_api(place)
+
+
+class TestNNFunctionalSigmoidAPI(unittest.TestCase):
+    def setUp(self):
+        self.init_data()
+
+    def init_data(self):
+        self.x_shape = [10, 15]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.y = self.ref_forward(self.x)
+
+    def ref_forward(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def check_static_api(self, place):
+        paddle.enable_static()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program):
+            x = paddle.static.data(name='x', shape=self.x_shape)
+            y = functional.sigmoid(x, name="api_sigmoid")
+        exe = paddle.static.Executor(fluid.CPUPlace())
+        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.assertTrue(np.allclose(out[0], self.y))
+
+    def check_dynamic_api(self):
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = functional.sigmoid(x)
+        self.assertTrue(np.allclose(y.numpy(), self.y))
+
+    def test_check_api(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_static_api(place)
+            self.check_dynamic_api()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index e6b7a3e7603f53d78052d5de309d6ed7d84c4660..c047cf6ddff78641b918de75a284574175bb3bca 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -22,17 +22,48 @@ import paddle.fluid as fluid
 
 
 def p_norm(x, axis, porder, keepdims=False):
-    if axis is None: axis = -1
-    xp = np.power(np.abs(x), porder)
-    s = np.sum(xp, axis=axis, keepdims=keepdims)
-    r = np.power(s, 1.0 / porder)
+    r = []
+    if axis is None:
+        x = x.flatten()
+        if porder == np.inf:
+            r = np.amax(np.abs(x))
+        elif porder == -np.inf:
+            r = np.amin(np.abs(x))
+        else:
+            r = np.linalg.norm(x, ord=porder)
+    elif isinstance(axis, list or tuple) and len(axis) == 2:
+        if porder == np.inf:
+            axis = tuple(axis)
+            r = np.amax(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == -np.inf:
+            axis = tuple(axis)
+            r = np.amin(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == 0:
+            axis = tuple(axis)
+            r = x.astype(bool)
+            r = np.sum(r, axis)
+        elif porder == 1:
+            axis = tuple(axis)
+            r = np.sum(np.abs(x), axis)
+        else:
+            axis = tuple(axis)
+            xp = np.power(np.abs(x), porder)
+            s = np.sum(xp, axis=axis, keepdims=keepdims)
+            r = np.power(s, 1.0 / porder)
+    else:
+        if isinstance(axis, list):
+            axis = tuple(axis)
+        r = np.linalg.norm(
+            x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype)
+
     return r
 
 
 def frobenius_norm(x, axis=None, keepdims=False):
     if isinstance(axis, list): axis = tuple(axis)
     if axis is None: axis = (-2, -1)
-    r = np.linalg.norm(x, ord='fro', axis=axis, keepdims=keepdims)
+    r = np.linalg.norm(
+        x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype)
     return r
 
 
@@ -89,6 +120,7 @@ class TestPnormOp(OpTest):
             'porder': float(self.porder)
         }
         self.outputs = {'Out': norm}
+        self.gradient = self.calc_gradient()
 
     def test_check_output(self):
         self.check_output()
@@ -104,6 +136,34 @@ class TestPnormOp(OpTest):
         self.keepdim = False
         self.dtype = "float64"
 
+    def calc_gradient(self):
+        self.attrs = {
+            'epsilon': self.epsilon,
+            'axis': self.axis,
+            'keepdim': self.keepdim,
+            'porder': float(self.porder)
+        }
+        x = self.inputs["X"]
+        porder = self.attrs["porder"]
+        axis = self.attrs["axis"]
+        if porder == 0:
+            grad = np.zeros(x.shape).astype(x.dtype)
+        elif porder in [float("inf"), float("-inf")]:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            x_abs = np.abs(x)
+            grad = np.sign(x)
+            grad[x_abs != norm] = 0.0
+        else:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            grad = np.power(norm, 1 - porder) * np.power(
+                np.abs(x), porder - 1) * np.sign(x)
+
+        numel = 1
+        for s in x.shape:
+            numel *= s
+        numel /= x.shape[axis]
+        return [grad.astype(x.dtype) * 1 / numel]
+
 
 class TestPnormOp2(TestPnormOp):
     def init_test_case(self):
@@ -118,22 +178,49 @@ class TestPnormOp2(TestPnormOp):
         self.check_grad(['X'], 'Out')
 
 
-def run_out(self, p, axis, shape_x, shape_y, dtype):
-    with fluid.program_guard(fluid.Program()):
-        data1 = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        data2 = fluid.data(name="Y", shape=shape_y, dtype=dtype)
-        out = paddle.norm(input=data1, p=p, axis=axis, out=data2)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        result = exe.run(feed={"X": np.random.rand(*shape_x).astype(dtype)},
-                         fetch_list=[data2, out])
-        self.assertEqual((result[0] == result[1]).all(), True)
+class TestPnormOp3(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = np.inf
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+class TestPnormOp4(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = -np.inf
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+class TestPnormOp5(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = 0
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
 def run_fro(self, p, axis, shape_x, dtype):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(input=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
@@ -145,31 +232,72 @@ def run_fro(self, p, axis, shape_x, dtype):
 def run_pnorm(self, p, axis, shape_x, dtype):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(input=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
         expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-    self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+        self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+
+
+def run_graph(self, p, axis, shape_x, dtype):
+    paddle.disable_static()
+    shape = [2, 3, 4]
+    np_input = np.arange(24).astype('float32') - 12
+    np_input = np_input.reshape(shape)
+    x = paddle.to_tensor(np_input)
+    #[[[-12. -11. -10.  -9.] [ -8.  -7.  -6.  -5.] [ -4.  -3.  -2.  -1.]]
+    # [[  0.   1.   2.   3.] [  4.   5.   6.   7.] [  8.   9.  10.  11.]]]
+    out_pnorm = paddle.norm(x, p=2, axis=-1)
+
+    # compute frobenius norm along last two dimensions.
+    out_fro = paddle.norm(x, p='fro')
+    out_fro = paddle.norm(x, p='fro', axis=[0, 1])
+    # compute 2-order  norm along [0,1] dimension.
+    out_pnorm = paddle.norm(x, p=2, axis=[0, 1])
+    out_pnorm = paddle.norm(x, p=2)
+    #out_pnorm = [17.43559577 16.91153453 16.73320053 16.91153453]
+    # compute inf-order  norm
+    out_pnorm = paddle.norm(x, p=np.inf)
+    #out_pnorm = [12.]
+    out_pnorm = paddle.norm(x, p=np.inf, axis=0)
+    #out_pnorm = [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]]
+
+    # compute -inf-order  norm
+    out_pnorm = paddle.norm(x, p=-np.inf)
+    #out_pnorm = [0.]
+    out_pnorm = paddle.norm(x, p=-np.inf, axis=0)
+    # out_fro = [17.43559577 16.91153453 16.73320053 16.91153453]
+    paddle.enable_static()
 
 
 class API_NormTest(unittest.TestCase):
-    def test_output_result(self):
-        run_out(self, p=2, axis=1, shape_x=[3, 4], shape_y=[3], dtype="float32")
-        run_out(
-            self,
-            p='fro',
-            axis=None,
-            shape_x=[3, 4],
-            shape_y=[1],
-            dtype="float32")
-
     def test_basic(self):
-        run_fro(self, p='fro', axis=None, shape_x=[3, 3, 4], dtype="float32")
-        run_fro(self, p='fro', axis=[0, 1], shape_x=[3, 3, 4], dtype="float64")
+        run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
+        run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
         run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32")
         run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32")
+        run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32")
+        run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64")
+
+        run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+
+    def test_dygraph(self):
+        run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
 
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
@@ -197,11 +325,7 @@ class API_NormTest(unittest.TestCase):
             self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm")
             self.assertRaises(ValueError, paddle.norm, data, p=[1])
             self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1)
-            self.assertRaises(
-                ValueError, paddle.norm, data, p='unspport', axis=[-2, -1])
             data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64")
-            self.assertRaises(
-                ValueError, paddle.norm, data, p='unspport', axis=[-2, -1])
             self.assertRaises(
                 ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1])
 
diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9d9af4d50be77bd1d2ecc11dd872ef612209f1e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_normal.py
@@ -0,0 +1,197 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle
+import copy
+
+np.random.seed(10)
+
+
+class TestNormalAPI(unittest.TestCase):
+    def setUp(self):
+        self.mean = 1.0
+        self.std = 0.0
+        self.shape = None
+        self.repeat_num = 1000
+        self.set_attrs()
+        self.dtype = self.get_dtype()
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        self.shape = [8, 12]
+
+    def get_shape(self):
+        if isinstance(self.mean, np.ndarray):
+            shape = self.mean.shape
+        elif isinstance(self.std, np.ndarray):
+            shape = self.std.shape
+        else:
+            shape = self.shape
+        return list(shape)
+
+    def get_dtype(self):
+        if isinstance(self.mean, np.ndarray):
+            return self.mean.dtype
+        elif isinstance(self.std, np.ndarray):
+            return self.std.dtype
+        else:
+            return 'float32'
+
+    def static_api(self):
+        shape = self.get_shape()
+        ret_all_shape = copy.deepcopy(shape)
+        ret_all_shape.insert(0, self.repeat_num)
+        ret_all = np.zeros(ret_all_shape, self.dtype)
+        if isinstance(self.mean, np.ndarray) \
+            and isinstance(self.std, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                mean = paddle.data('Mean', self.mean.shape, self.mean.dtype)
+                std = paddle.data('Std', self.std.shape, self.std.dtype)
+                out = paddle.normal(mean, std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={
+                        'Mean': self.mean,
+                        'Std': self.std.reshape(shape)
+                    },
+                                  fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        elif isinstance(self.mean, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                mean = paddle.data('Mean', self.mean.shape, self.mean.dtype)
+                out = paddle.normal(mean, self.std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={'Mean': self.mean}, fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        elif isinstance(self.std, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                std = paddle.data('Std', self.std.shape, self.std.dtype)
+                out = paddle.normal(self.mean, std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={'Std': self.std}, fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        else:
+            with paddle.static.program_guard(paddle.static.Program()):
+                out = paddle.normal(self.mean, self.std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+
+    def dygraph_api(self):
+        paddle.disable_static(self.place)
+        shape = self.get_shape()
+        ret_all_shape = copy.deepcopy(shape)
+        ret_all_shape.insert(0, self.repeat_num)
+        ret_all = np.zeros(ret_all_shape, self.dtype)
+
+        mean = paddle.to_tensor(self.mean) \
+            if isinstance(self.mean, np.ndarray) else self.mean
+        std = paddle.to_tensor(self.std) \
+            if isinstance(self.std, np.ndarray) else self.std
+        for i in range(self.repeat_num):
+            out = paddle.normal(mean, std, self.shape)
+            ret_all[i] = out.numpy()
+        paddle.enable_static()
+        return ret_all
+
+    def test_api(self):
+        ret_static = self.static_api()
+        ret_dygraph = self.dygraph_api()
+        for ret in [ret_static, ret_dygraph]:
+            shape_ref = self.get_shape()
+            self.assertEqual(shape_ref, list(ret[0].shape))
+
+            ret = ret.flatten().reshape([self.repeat_num, -1])
+            mean = np.mean(ret, axis=0)
+            std = np.std(ret, axis=0)
+            mean_ref=self.mean.reshape([1, -1]) \
+                if isinstance(self.mean, np.ndarray) else self.mean
+            std_ref=self.std.reshape([1, -1]) \
+                if isinstance(self.std, np.ndarray) else self.std
+            self.assertTrue(np.allclose(mean_ref, mean, 0.1, 0.1))
+            self.assertTrue(np.allclose(std_ref, std, 0.1, 0.1))
+
+
+class TestNormalAPI_mean_is_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(-2, -1, [2, 3, 4, 5]).astype('float64')
+
+
+class TestNormalAPI_std_is_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.std = np.random.uniform(0.7, 1, [2, 3, 17]).astype('float64')
+
+
+class TestNormalAPI_mean_std_are_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(1, 2, [1, 100]).astype('float64')
+        self.std = np.random.uniform(0.5, 1, [1, 100]).astype('float64')
+
+
+class TestNormalAPI_mean_std_are_tensor_with_different_dtype(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(1, 2, [100]).astype('float64')
+        self.std = np.random.uniform(1, 2, [100]).astype('float32')
+
+
+class TestNormalAlias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        shape = [1, 2, 3]
+        out1 = paddle.normal(shape=shape)
+        out2 = paddle.tensor.normal(shape=shape)
+        out3 = paddle.tensor.random.normal(shape=shape)
+        paddle.enable_static()
+
+
+class TestNormalErrors(unittest.TestCase):
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            mean = [1, 2, 3]
+            self.assertRaises(TypeError, paddle.normal, mean)
+
+            std = [1, 2, 3]
+            self.assertRaises(TypeError, paddle.normal, std=std)
+
+            mean = paddle.data('Mean', [100], 'int32')
+            self.assertRaises(TypeError, paddle.normal, mean)
+
+            std = paddle.data('Std', [100], 'int32')
+            self.assertRaises(TypeError, paddle.normal, mean=1.0, std=std)
+
+            self.assertRaises(TypeError, paddle.normal, shape=1)
+
+            self.assertRaises(TypeError, paddle.normal, shape=[1.0])
+
+            shape = paddle.data('Shape', [100], 'float32')
+            self.assertRaises(TypeError, paddle.normal, shape=shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..614e0e897613b235e2ec6fa72cfaf1057e7d5bbd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_normalize.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+
+
+def p_normalize(x, axis=1, p=2, epsilon=1e-12, keepdims=True):
+    xp = np.power(np.abs(x), p)
+    s = np.sum(xp, axis=axis, keepdims=keepdims)
+    r = np.maximum(np.power(s, 1.0 / p), epsilon)
+    return x / r
+
+
+class TestNNFunctionalNormalize(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
+        self.input_np2 = np.array([0.0, 0.0]).astype(np.float32)
+        self.expected0 = p_normalize(self.input_np)
+        self.expected1 = p_normalize(self.input_np, p=1.5)
+        self.expected2 = p_normalize(self.input_np, axis=0)
+        self.expected3 = p_normalize(self.input_np2, axis=0)
+
+    def run_imperative(self):
+        x = paddle.to_tensor(self.input_np)
+        y = F.normalize(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected0))
+
+        y = F.normalize(x, p=1.5)
+        self.assertTrue(np.allclose(y.numpy(), self.expected1))
+
+        y = F.normalize(x, axis=0)
+        self.assertTrue(np.allclose(y.numpy(), self.expected2))
+
+        x = paddle.to_tensor(self.input_np2)
+        y = F.normalize(x, axis=0)
+        self.assertTrue(np.allclose(y.numpy(), self.expected3))
+
+        self.assertRaises(BaseException, F.normalize, x)
+
+    def run_static(self, use_gpu=False):
+        x = paddle.data(name='input', shape=[10, 10], dtype='float32')
+        x2 = paddle.data(name='input2', shape=[2], dtype='float32')
+        result0 = F.normalize(x)
+        result1 = F.normalize(x, p=1.5)
+        result2 = F.normalize(x, axis=0)
+        result3 = F.normalize(x, name='aaa')
+        result4 = F.normalize(x2, axis=0)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "input2": self.input_np2},
+            fetch_list=[result0, result1, result2, result4])
+
+        self.assertTrue(np.allclose(static_result[0], self.expected0))
+        self.assertTrue(np.allclose(static_result[1], self.expected1))
+        self.assertTrue(np.allclose(static_result[2], self.expected2))
+        self.assertTrue('aaa' in result3.name)
+        self.assertTrue(np.allclose(static_result[3], self.expected3))
+        self.assertRaises(ValueError, F.normalize, x2)
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8512bc99e7451c73e5513b834fb6aa448717c646
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import functools
+import paddle
+
+
+class TestNumelOp(OpTest):
+    def setUp(self):
+        self.op_type = "size"
+        self.init()
+        x = np.random.random((self.shape)).astype("float64")
+        self.inputs = {'Input': x, }
+        self.outputs = {'Out': np.array([np.size(x)])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init(self):
+        self.shape = (6, 56, 8, 55)
+
+
+class TestNumelOp1(TestNumelOp):
+    def init(self):
+        self.shape = (11, 66)
+
+
+class TestNumelOp2(TestNumelOp):
+    def init(self):
+        self.shape = (0, )
+
+
+class TestNumelOoAPI(unittest.TestCase):
+    def test_numel_static(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            shape1 = [2, 1, 4, 5]
+            shape2 = [1, 4, 5]
+            x_1 = paddle.data(shape=shape1, dtype='int32', name='x_1')
+            x_2 = paddle.data(shape=shape2, dtype='int32', name='x_2')
+            input_1 = np.random.random(shape1).astype("int32")
+            input_2 = np.random.random(shape2).astype("int32")
+            out_1 = paddle.numel(x_1)
+            out_2 = paddle.numel(x_2)
+            exe = paddle.static.Executor(place=paddle.CPUPlace())
+            res_1, res_2 = exe.run(feed={
+                "x_1": input_1,
+                "x_2": input_2,
+            },
+                                   fetch_list=[out_1, out_2])
+            assert (np.array_equal(
+                res_1, np.array([np.size(input_1)]).astype("int64")))
+            assert (np.array_equal(
+                res_2, np.array([np.size(input_2)]).astype("int64")))
+
+    def test_numel_imperative(self):
+        paddle.disable_static(paddle.CPUPlace())
+        input_1 = np.random.random([2, 1, 4, 5]).astype("int32")
+        input_2 = np.random.random([1, 4, 5]).astype("int32")
+        x_1 = paddle.to_variable(input_1)
+        x_2 = paddle.to_variable(input_2)
+        out_1 = paddle.numel(x_1)
+        out_2 = paddle.numel(x_2)
+        assert (np.array_equal(out_1.numpy().item(0), np.size(input_1)))
+        assert (np.array_equal(out_2.numpy().item(0), np.size(input_2)))
+        paddle.enable_static()
+
+    def test_error(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+
+            def test_x_type():
+                shape = [1, 4, 5]
+                input_1 = np.random.random(shape).astype("int32")
+                out_1 = paddle.numel(input_1)
+
+            self.assertRaises(TypeError, test_x_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py
index 4e3b3f3edc9f92a2b268586f79dbcc3aafc05031..c1e6a3377710f98184e9541e287b911def89cd81 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
@@ -62,18 +62,18 @@ class TestOnesLikeImpeartive(unittest.TestCase):
         shape = [3, 4]
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with paddle.imperative.guard(place):
-            x = paddle.imperative.to_variable(np.ones(shape))
-            for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
-                out = ones_like(x, dtype)
-                self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(),
-                                 True)
-
-            out = paddle.tensor.ones_like(x)
+        paddle.disable_static(place)
+        x = paddle.to_variable(np.ones(shape))
+        for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
+            out = ones_like(x, dtype)
             self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
 
-            out = paddle.tensor.creation.ones_like(x)
-            self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+        out = paddle.tensor.ones_like(x)
+        self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+
+        out = paddle.tensor.creation.ones_like(x)
+        self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_ones_op.py b/python/paddle/fluid/tests/unittests/test_ones_op.py
index d50e820c6c6bc89a9346382c79f057e179f1da12..47ce37964324208a032c821360d6ab10666abcb5 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_op.py
@@ -27,35 +27,35 @@ import numpy as np
 
 class ApiOnesTest(unittest.TestCase):
     def test_paddle_ones(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10])
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="float32")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10], dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
 
     def test_fluid_ones(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = fluid.layers.ones(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -64,25 +64,25 @@ class ApiOnesTest(unittest.TestCase):
 class ApiOnesZerosError(unittest.TestCase):
     def test_errors(self):
         def test_error1():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = paddle.ones(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error1)
 
         def test_error2():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = paddle.ones(shape=10)
 
         self.assertRaises(TypeError, test_error2)
 
         def test_error3():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = fluid.layers.ones(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error3)
 
         def test_error4():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = fluid.layers.ones(shape=[10], dtype="int8")
 
         self.assertRaises(TypeError, test_error4)
diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..11719a9c4a92807375c1fdfcc7e168dccc5e522c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
@@ -0,0 +1,713 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+
+from paddle.fluid import Program, program_guard, Executor, default_main_program
+
+
+class TestPad3dOp(OpTest):
+    def setUp(self):
+        paddle.enable_static()
+        self.value = 0.0
+        self.variable_paddings = False
+        self.initTestCase()
+        self.op_type = "pad3d"
+        self.inputs = {'X': np.random.random(self.shape).astype("float64")}
+        self.attrs = {}
+        if self.variable_paddings:
+            self.attrs['paddings'] = []
+            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        else:
+            self.attrs['paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        self.attrs['value'] = self.value
+        self.attrs['mode'] = self.mode
+        self.attrs['data_format'] = self.data_format
+        if self.data_format == "NCDHW":
+            paddings = [
+                (0, 0),
+                (0, 0),
+                (self.paddings[4], self.paddings[5]),
+                (self.paddings[2], self.paddings[3]),
+                (self.paddings[0], self.paddings[1]),
+            ]
+        else:
+            paddings = [
+                (0, 0),
+                (self.paddings[4], self.paddings[5]),
+                (self.paddings[2], self.paddings[3]),
+                (self.paddings[0], self.paddings[1]),
+                (0, 0),
+            ]
+        if self.mode == "constant":
+            out = np.pad(self.inputs['X'],
+                         paddings,
+                         mode=self.mode,
+                         constant_values=self.value)
+        elif self.mode == "reflect":
+            out = np.pad(self.inputs['X'], paddings, mode=self.mode)
+        elif self.mode == "replicate":
+            out = np.pad(self.inputs['X'], paddings, mode="edge")
+        elif self.mode == "circular":
+            out = np.pad(self.inputs['X'], paddings, mode="wrap")
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out')
+
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 0, 0, 0, 0, 0]
+        self.mode = "constant"
+        self.data_format = "NCDHW"
+        self.pad_value = 0.0
+
+
+class TestCase1(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 4, 5]
+        self.mode = "constant"
+        self.data_format = "NCDHW"
+        self.value = 1.0
+
+
+class TestCase2(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [1, 1, 1, 1, 1, 1]
+        self.mode = "constant"
+        self.data_format = "NDHWC"
+        self.value = 1.0
+
+
+class TestCase3(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 1, 0, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NCDHW"
+
+
+class TestCase4(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [0, 1, 2, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NDHWC"
+
+
+class TestCase5(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 2, 1]
+        self.mode = "replicate"
+        self.data_format = "NCDHW"
+
+
+class TestCase6(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [5, 4, 2, 1, 2, 3]
+        self.mode = "replicate"
+        self.data_format = "NDHWC"
+
+
+class TestCase7(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 2, 1]
+        self.mode = "circular"
+        self.data_format = "NCDHW"
+
+
+class TestCase8(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [0, 1, 2, 1, 2, 3]
+        self.mode = "circular"
+        self.data_format = "NDHWC"
+
+
+class TestPadAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def check_static_result_1(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (1, 2, 3, 4, 5)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "constant"
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result = F.pad(x=x,
+                           pad=pad,
+                           value=value,
+                           mode=mode,
+                           data_format="NCDHW")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result])
+
+            np_out = self._get_numpy_out(input_data, pad, mode, value)
+            self.assertTrue(np.allclose(fetches[0], np_out))
+
+    def check_static_result_2(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 1, 2]
+            mode = "reflect"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def check_static_result_3(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "replicate"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def check_static_result_4(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "circular"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0,
+                       data_format="NCDHW"):
+        if data_format == "NCDHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NDHWC":
+            pad = [
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+        elif data_format == "NCHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NHWC":
+            pad = [
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+        elif data_format == "NCL":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NLC":
+            pad = [
+                (0, 0),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+        elif mode == "circular":
+            out = np.pad(input_data, pad, mode="wrap")
+
+        return out
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result_1(place=place)
+            self.check_static_result_2(place=place)
+            self.check_static_result_3(place=place)
+            self.check_static_result_4(place=place)
+
+    def test_dygraph_1(self):
+        paddle.disable_static()
+
+        input_shape = (1, 2, 3, 4, 5)
+        pad = [1, 2, 1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCDHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NDHWC")
+        tensor_data = paddle.to_tensor(input_data)
+
+        y1 = F.pad(tensor_data,
+                   pad=pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCDHW")
+        y2 = F.pad(tensor_data,
+                   pad=pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NDHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        input_shape = (2, 3, 4, 5)
+        pad = [1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NHWC")
+
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCHW")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        input_shape = (2, 3, 4, 5)
+        pad = [1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NHWC")
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCHW")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_3(self):
+        paddle.disable_static()
+
+        input_shape = (3, 4, 5)
+        pad = [3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCL")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NLC")
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCL")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NLC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+
+class TestPad1dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCL"):
+        if data_format == "NCL":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5)
+            pad = [1, 2]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_reflection = nn.ReflectionPad1d(padding=pad)
+            pad_replication = nn.ReplicationPad1d(padding=pad)
+            pad_constant = nn.ConstantPad1d(padding=pad, value=value)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_reflection(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "reflect", data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad2dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCHW"):
+        if data_format == "NCHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5, 6)
+            pad = [1, 2, 2, 1]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_reflection = nn.ReflectionPad2d(padding=pad)
+            pad_replication = nn.ReplicationPad2d(padding=pad)
+            pad_constant = nn.ConstantPad2d(padding=pad, value=value)
+            pad_zero = nn.ZeroPad2d(padding=pad)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_reflection(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "reflect", data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_zero(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=0, data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad3dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCDHW"):
+        if data_format == "NCDHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5, 6, 7)
+            pad = [1, 2, 2, 1, 1, 0]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_replication = nn.ReplicationPad3d(padding=pad)
+            pad_constant = nn.ConstantPad3d(padding=pad, value=value)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCDHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCDHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad3dOpError(unittest.TestCase):
+    def test_errors(self):
+        def test_variable():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            F.pad(x=data, paddings=[1, 1, 1, 1, 1, 1])
+
+        def test_reflect_1():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[5, 6, 1, 1, 1, 1], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        def test_reflect_2():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[1, 1, 4, 3, 1, 1], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        def test_reflect_3():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[1, 1, 1, 1, 2, 3], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        self.assertRaises(TypeError, test_variable)
+
+        self.assertRaises(Exception, test_reflect_1)
+
+        self.assertRaises(Exception, test_reflect_2)
+
+        self.assertRaises(Exception, test_reflect_3)
+
+
+class TestPadDataformatError(unittest.TestCase):
+    def test_errors(self):
+        def test_ncl():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 3, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCL")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        def test_nchw():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCHW")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        def test_ncdhw():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 3, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCDHW")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        self.assertRaises(AssertionError, test_ncl)
+
+        self.assertRaises(AssertionError, test_nchw)
+
+        self.assertRaises(AssertionError, test_ncdhw)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
index 50e587478957a9e5c359d0c8a9d606859f17e994..2ffe523ef6dda18a24813e702a1892c335ba6a68 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
@@ -22,7 +22,7 @@ import paddle
 
 def _dygraph_guard_(func):
     def __impl__(*args, **kwargs):
-        if paddle.in_imperative_mode():
+        if paddle.in_dynamic_mode():
             return func(*args, **kwargs)
         else:
             with fluid.dygraph.guard():
@@ -52,17 +52,14 @@ class TestDygraphDoubleGrad(TestCase):
              retain_graph=None,
              create_graph=False,
              allow_unused=False):
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = self.sort_sum_gradient
-        return paddle.imperative.grad(
+        return paddle.grad(
             outputs=outputs,
             inputs=inputs,
             grad_outputs=grad_outputs,
             no_grad_vars=no_grad_vars,
             retain_graph=retain_graph,
             create_graph=create_graph,
-            allow_unused=allow_unused,
-            backward_strategy=backward_strategy)
+            allow_unused=allow_unused)
 
     @dygraph_guard
     def test_exception(self):
diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf0efa6ec2e7edafb8d331423a7b47155283c21
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False):
+    return np.linalg.norm(x - y, ord=p, axis=1, keepdims=keepdim)
+
+
+def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+
+    place = fluid.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda(
+    ) else fluid.CPUPlace()
+
+    with paddle.static.program_guard(prog, startup_prog):
+        x = paddle.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+        y = paddle.data(name='y', shape=y_np.shape, dtype=x_np.dtype)
+        dist = paddle.nn.layer.distance.PairwiseDistance(
+            p=p, epsilon=epsilon, keepdim=keepdim)
+        distance = dist(x, y)
+        exe = paddle.static.Executor(place)
+        static_ret = exe.run(prog,
+                             feed={'x': x_np,
+                                   'y': y_np},
+                             fetch_list=[distance])
+        static_ret = static_ret[0]
+    return static_ret
+
+
+def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
+    paddle.disable_static()
+    x = paddle.to_variable(x_np)
+    y = paddle.to_variable(y_np)
+    dist = paddle.nn.layer.distance.PairwiseDistance(
+        p=p, epsilon=epsilon, keepdim=keepdim)
+    distance = dist(x, y)
+    dygraph_ret = distance.numpy()
+    paddle.enable_static()
+    return dygraph_ret
+
+
+class TestPairwiseDistance(unittest.TestCase):
+    def test_pairwise_distance(self):
+        all_shape = [[100, 100], [4, 5, 6, 7]]
+        dtypes = ['float32', 'float64']
+        keeps = [False, True]
+        for shape in all_shape:
+            for dtype in dtypes:
+                for keepdim in keeps:
+                    x_np = np.random.random(shape).astype(dtype)
+                    y_np = np.random.random(shape).astype(dtype)
+
+                    static_ret = test_static(x_np, y_np, keepdim=keepdim)
+                    dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim)
+                    excepted_value = pairwise_distance(
+                        x_np, y_np, keepdim=keepdim)
+
+                    self.assertTrue(np.allclose(static_ret, dygraph_ret))
+                    self.assertTrue(np.allclose(static_ret, excepted_value))
+                    self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+    def test_pairwise_distance_broadcast(self):
+        shape_x = [100, 100]
+        shape_y = [100, 1]
+        keepdim = False
+        x_np = np.random.random(shape_x).astype('float32')
+        y_np = np.random.random(shape_y).astype('float32')
+        static_ret = test_static(x_np, y_np, keepdim=keepdim)
+        dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim)
+        excepted_value = pairwise_distance(x_np, y_np, keepdim=keepdim)
+        self.assertTrue(np.allclose(static_ret, dygraph_ret))
+        self.assertTrue(np.allclose(static_ret, excepted_value))
+        self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+    def test_pairwise_distance_different_p(self):
+        shape = [100, 100]
+        keepdim = False
+        p = 3.0
+        x_np = np.random.random(shape).astype('float32')
+        y_np = np.random.random(shape).astype('float32')
+        static_ret = test_static(x_np, y_np, p=p, keepdim=keepdim)
+        dygraph_ret = test_dygraph(x_np, y_np, p=p, keepdim=keepdim)
+        excepted_value = pairwise_distance(x_np, y_np, p=p, keepdim=keepdim)
+        self.assertTrue(np.allclose(static_ret, dygraph_ret))
+        self.assertTrue(np.allclose(static_ret, excepted_value))
+        self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cf1e9711b74b31e15b732f87addbc9fa653152f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import paddle.fluid as fluid
+
+import os
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestParallelDygraphMnist(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._nccl2_mode = True
+        self._dygraph = True
+
+    def test_mnist(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "parallel_dygraph_sync_batch_norm.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
index 0bcb4be3b7fb9380932cf137ac8e4939dcd77288..cf93f39ab8c5c92aa075f2f0a7dca9a5c5d9f485 100644
--- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
@@ -16,16 +16,17 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+
 from op_test import OpTest
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+import paddle.fluid as fluid
 
 
-class TestPixelShuffle(OpTest):
-    def setUp(self):
-        self.op_type = "pixel_shuffle"
-        n, c, h, w = 2, 9, 4, 4
-        up_factor = 3
-        shape = [n, c, h, w]
-        x = np.random.random(shape).astype("float64")
+def pixel_shuffle_np(x, up_factor, data_format="NCHW"):
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
         new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
                      w)
         # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
@@ -34,10 +35,42 @@ class TestPixelShuffle(OpTest):
         npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
         oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
         npresult = np.reshape(npresult, oshape)
+        return npresult
+    else:
+        n, h, w, c = x.shape
+        new_shape = (n, h, w, c // (up_factor * up_factor), up_factor,
+                     up_factor)
+        # reshape to (num,h,w,output_channel,upscale_factor,upscale_factor)
+        npresult = np.reshape(x, new_shape)
+        # transpose to (num,h,upscale_factor,w,upscale_factor,output_channel)
+        npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
+        oshape = [n, h * up_factor, w * up_factor, c // (up_factor * up_factor)]
+        npresult = np.reshape(npresult, oshape)
+        return npresult
+
+
+class TestPixelShuffleOp(OpTest):
+    def setUp(self):
+        self.op_type = "pixel_shuffle"
+        self.init_data_format()
+        n, c, h, w = 2, 9, 4, 4
+
+        if self.format == "NCHW":
+            shape = [n, c, h, w]
+        if self.format == "NHWC":
+            shape = [n, h, w, c]
+
+        up_factor = 3
+
+        x = np.random.random(shape).astype("float64")
+        npresult = pixel_shuffle_np(x, up_factor, self.format)
 
         self.inputs = {'X': x}
         self.outputs = {'Out': npresult}
-        self.attrs = {'upscale_factor': up_factor}
+        self.attrs = {'upscale_factor': up_factor, "data_format": self.format}
+
+    def init_data_format(self):
+        self.format = "NCHW"
 
     def test_check_output(self):
         self.check_output()
@@ -46,5 +79,141 @@ class TestPixelShuffle(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestChannelLast(TestPixelShuffleOp):
+    def init_data_format(self):
+        self.format = "NHWC"
+
+
+class TestPixelShuffleAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
+        self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
+        self.out_1_np = pixel_shuffle_np(self.x_1_np, 3)
+        self.out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+    def test_static_graph_functional(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64")
+            x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            out_1 = F.pixel_shuffle(x_1, 3)
+            out_2 = F.pixel_shuffle(x_2, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(fluid.default_main_program(),
+                            feed={"x": self.x_1_np},
+                            fetch_list=out_1,
+                            use_prune=True)
+
+            res_2 = exe.run(fluid.default_main_program(),
+                            feed={"x2": self.x_2_np},
+                            fetch_list=out_2,
+                            use_prune=True)
+
+            assert np.allclose(res_1, self.out_1_np)
+            assert np.allclose(res_2, self.out_2_np)
+
+    # same test between layer and functional in this op.
+    def test_static_graph_layer(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64")
+            x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            # init instance
+            ps_1 = paddle.nn.PixelShuffle(3)
+            ps_2 = paddle.nn.PixelShuffle(3, "NHWC")
+            out_1 = ps_1(x_1)
+            out_2 = ps_2(x_2)
+            out_1_np = pixel_shuffle_np(self.x_1_np, 3)
+            out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(fluid.default_main_program(),
+                            feed={"x": self.x_1_np},
+                            fetch_list=out_1,
+                            use_prune=True)
+
+            res_2 = exe.run(fluid.default_main_program(),
+                            feed={"x2": self.x_2_np},
+                            fetch_list=out_2,
+                            use_prune=True)
+
+            assert np.allclose(res_1, out_1_np)
+            assert np.allclose(res_2, out_2_np)
+
+    def run_dygraph(self, up_factor, data_format):
+
+        n, c, h, w = 2, 9, 4, 4
+
+        if data_format == "NCHW":
+            shape = [n, c, h, w]
+        if data_format == "NHWC":
+            shape = [n, h, w, c]
+
+        x = np.random.random(shape).astype("float64")
+
+        npresult = pixel_shuffle_np(x, up_factor, data_format)
+
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.disable_static(place=place)
+
+            pixel_shuffle = paddle.nn.PixelShuffle(
+                up_factor, data_format=data_format)
+            result = pixel_shuffle(paddle.to_tensor(x))
+
+            self.assertTrue(np.allclose(result.numpy(), npresult))
+
+            result_functional = F.pixel_shuffle(
+                paddle.to_tensor(x), 3, data_format)
+            self.assertTrue(np.allclose(result_functional.numpy(), npresult))
+
+    def test_dygraph1(self):
+        self.run_dygraph(3, "NCHW")
+
+    def test_dygraph2(self):
+        self.run_dygraph(3, "NHWC")
+
+
+class TestPixelShuffleError(unittest.TestCase):
+    def test_error_functional(self):
+        def error_upscale_factor():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3.33)
+
+        self.assertRaises(TypeError, error_upscale_factor)
+
+        def error_data_format():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3, "WOW")
+
+        self.assertRaises(ValueError, error_data_format)
+
+    def test_error_layer(self):
+        def error_upscale_factor_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                ps = paddle.nn.PixelShuffle(3.33)
+
+        self.assertRaises(TypeError, error_upscale_factor_layer)
+
+        def error_data_format_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                ps = paddle.nn.PixelShuffle(3, "MEOW")
+
+        self.assertRaises(ValueError, error_data_format_layer)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1a25ad3529e8b0a4126bc458838ecd876e5af30
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -0,0 +1,373 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def max_pool1D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False,
+                             exclusive=False,
+                             adaptive=False,
+                             data_type=np.float64):
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        x_masked = x[:, :, r_start:r_end]
+
+        out[:, :, i] = np.max(x_masked, axis=(2))
+    return out
+
+
+def avg_pool1D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False,
+                             exclusive=False,
+                             adaptive=False,
+                             data_type=np.float64):
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        x_masked = x[:, :, r_start:r_end]
+
+        field_size = (r_end - r_start) \
+            if (exclusive or adaptive) else (ksize[0])
+        if data_type == np.int8 or data_type == np.uint8:
+            out[:, :, i] = (np.rint(
+                np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type)
+        else:
+            out[:, :, i] = (np.sum(x_masked, axis=(2)) /
+                            field_size).astype(data_type)
+    return out
+
+
+class TestPool1d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0], ceil_mode=False)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=[0])
+
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool1d_dg = paddle.nn.layer.AvgPool1d(
+                kernel_size=2, stride=None, padding=0)
+            result = avg_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.max_pool1d(input, kernel_size=2, stride=2, padding=[0])
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.max_pool1d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool1d_dg = paddle.nn.layer.MaxPool1d(
+                kernel_size=2, stride=None, padding=0)
+            result = max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
+                output_size=16)
+            result = ada_max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
+                output_size=16)
+            result = ada_max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_adaptive_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_padding_same(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.max_pool1d(
+                input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_dygraph_padding_same(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.avg_pool1d(
+                input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool1d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_adaptive_max_dygraph_results(place)
+            self.check_adaptive_avg_dygraph_results(place)
+            self.check_adaptive_max_static_results(place)
+            self.check_adaptive_avg_static_results(place)
+            self.check_max_dygraph_padding_same(place)
+            self.check_avg_dygraph_padding_same(place)
+
+
+class TestPool2dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[2]]
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[2]]
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run3)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.max_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run4)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.max_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.avg_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "paddle"
+                res_pd = F.avg_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run7)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a2be6de342efc4e8284e7c352137d0a3a1bcb9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive
+import unittest
+from op_test import OpTest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.nn.functional import avg_pool2d, max_pool2d
+import paddle.fluid as fluid
+import paddle
+
+
+class TestPool2d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            result = max_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = max_pool2d(
+                input, kernel_size=2, stride=2, padding=0, return_indices=False)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result, indices = max_pool2d(
+                input,
+                kernel_size=2,
+                stride=None,
+                padding="SAME",
+                return_indices=True)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max',
+                padding_algorithm="SAME")
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool2d(
+                input, kernel_size=2, stride=None, padding="SAME")
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg',
+                padding_algorithm="SAME")
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_padding(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
+            result = max_pool2d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                return_indices=False)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_divisor(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
+            result = avg_pool2d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=4)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool2d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_max_dygraph_stride_is_none(place)
+            self.check_avg_dygraph_stride_is_none(place)
+            self.check_max_dygraph_padding(place)
+            self.check_avg_divisor(place)
+
+
+class TestPool2dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
+                res_pd = max_pool2d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run3)
+
+        def run3_avg():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run3_avg)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run4)
+
+        def run4_avg():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run4_avg)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=False,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run7)
+
+        def run8():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=False,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run8)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index a9fdcd55f74cd53824016765fe82a03190f23f89..a12a328b653b26dac31b11839d02e867a012c4bc 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -517,6 +517,19 @@ class TestAvgPoolAdaptive(TestCase1):
         self.adaptive = True
 
 
+class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [8, 3, 6, 6]
+
+    def init_test_case(self):
+        self.ksize = [2, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0, 0, 0]
+
+
 #-------test pool2d with asymmetric padding-----
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc078e9aae7aafe55e937b80270dd012fd64ff70
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -0,0 +1,341 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.nn.functional import avg_pool3d, max_pool3d
+from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive
+
+
+class TestPool3d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            result = avg_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool3d(input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg',
+                padding_algorithm="SAME")
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3d(
+                kernel_size=2, stride=None, padding="SAME")
+            result = avg_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=None, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result, indices = max_pool3d(
+                input,
+                kernel_size=2,
+                stride=None,
+                padding="SAME",
+                return_indices=True)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max',
+                padding_algorithm="SAME")
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_padding(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=padding)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            padding = [0, 0, 0, 0, 0, 0]
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=padding)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_divisor(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = 0
+            result = avg_pool3d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=8)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            padding = [0, 0, 0, 0, 0, 0]
+            result = avg_pool3d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=8)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool3d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_max_dygraph_stride_is_none(place)
+            self.check_max_dygraph_padding(place)
+            self.check_avg_divisor(place)
+
+
+class TestPool3dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NCDHW')
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NDHWC')
+
+        self.assertRaises(ValueError, run3)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run4)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="padding",
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="padding",
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run7)
+
+        def run8():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="VALID",
+                    ceil_mode=True,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run8)
+
+        def run9():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="VALID",
+                    ceil_mode=True,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index ade7e9f50fd27a3bd4084a628eff445e0d81db0d..3d139e9b90c10e352599d506fb3ca837472348f5 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -453,6 +453,18 @@ class TestAvgPoolAdaptive(TestCase1):
         self.adaptive = True
 
 
+class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [8, 3, 2, 4, 4]
+
+    def init_test_case(self):
+        self.ksize = [2, 2, 3]
+        self.strides = [1, 1, 1]
+
+
 #-------test pool3d with asymmetric padding------
 class TestPool3d_Op_AsyPadding(TestPool3d_Op):
     def init_test_case(self):
diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py
new file mode 100755
index 0000000000000000000000000000000000000000..0764cb580e40d115d8703278380a9ced12e24201
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pow.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.tensor as tensor
+import paddle.fluid as fluid
+from paddle.static import Program, program_guard
+import numpy as np
+import unittest
+
+DYNAMIC = 1
+STATIC = 2
+
+
+def _run_power(mode, x, y):
+    # dynamic mode
+    if mode == DYNAMIC:
+        paddle.disable_static()
+        # y is scalar
+        if isinstance(y, (int, float)):
+            x_ = paddle.to_tensor(x)
+            y_ = y
+            res = paddle.pow(x_, y_)
+            return res.numpy()
+        # y is tensor
+        else:
+            x_ = paddle.to_tensor(x)
+            y_ = paddle.to_tensor(y)
+            res = paddle.pow(x_, y_)
+            return res.numpy()
+    # static mode
+    elif mode == STATIC:
+        paddle.enable_static()
+        # y is scalar
+        if isinstance(y, (int, float)):
+            with program_guard(Program(), Program()):
+                x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
+                y_ = y
+                res = paddle.pow(x_, y_)
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                outs = exe.run(feed={'x': x}, fetch_list=[res])
+                return outs[0]
+        # y is tensor
+        else:
+            with program_guard(Program(), Program()):
+                x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
+                y_ = paddle.static.data(name="y", shape=y.shape, dtype=y.dtype)
+                res = paddle.pow(x_, y_)
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                outs = exe.run(feed={'x': x, 'y': y}, fetch_list=[res])
+                return outs[0]
+
+
+class TestPowerAPI(unittest.TestCase):
+    """TestPowerAPI."""
+
+    def test_power(self):
+        """test_power."""
+        np.random.seed(7)
+        # test 1-d float tensor ** float scalar
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = np.random.rand() * 10
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** int scalar
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = int(np.random.rand() * 10)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = int(np.random.rand() * 10)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** 1-d float tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d float tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int32)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int32)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float32)
+        y = (np.random.rand(*dims) * 10).astype(np.float32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.float32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float32)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test broadcast
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1]) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+
+class TestPowerError(unittest.TestCase):
+    """TestPowerError."""
+
+    def test_errors(self):
+        """test_errors."""
+        np.random.seed(7)
+
+        # test dynamic computation graph: inputs must be broadcastable
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1] + 1) * 10).astype(np.float64)
+        self.assertRaises(fluid.core.EnforceNotMet, _run_power, DYNAMIC, x, y)
+        self.assertRaises(fluid.core.EnforceNotMet, _run_power, STATIC, x, y)
+
+        # test dynamic computation graph: inputs must be broadcastable
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1] + 1) * 10).astype(np.int8)
+        self.assertRaises(TypeError, paddle.pow, x, y)
+
+        # test 1-d float tensor ** int string
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = int(np.random.rand() * 10)
+        self.assertRaises(TypeError, paddle.pow, x, str(y))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index 398ad9aa698e5e7cb4102a72955c2d33e6a7e7a9..16388ff8f5f042326ac5705a5f69919f4f8061c2 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -18,23 +18,134 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
 import six
-import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
 from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.nn.functional as F
+
+
+def ref_prelu(x, weight):
+    x_t = x.copy()
+    weight = weight.reshape(1, -1, 1, 1)
+    neg_indices = x <= 0
+    assert x.shape == neg_indices.shape
+    x_t[neg_indices] = (x_t * weight)[neg_indices]
+    return (x_t, )
+
+
+def ref_prelu_nn(x, num_parameters, init):
+    weight_np = np.full((num_parameters), init)
+    return ref_prelu(x, weight_np)
 
 
-class TestPReluOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program()):
+class TestFunctionalPReluAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float32')
+        self.weight_np_0 = np.random.randn(1).astype('float32')
+        self.weight_np_1 = np.random.randn(self.x_np.shape[1]).astype('float32')
+
+    def static_check(self, weight_np):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, 'float32')
+            weight = paddle.data('Alpha', weight_np.shape, 'float32')
+            out = F.prelu(x, weight)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np,
+                                'Alpha': weight_np},
+                          fetch_list=[out])
+        out_ref = ref_prelu(self.x_np, weight_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def dygraph_check(self, weight_np):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        weight = paddle.to_tensor(weight_np)
+        out = F.prelu(x, weight)
+        out_ref = ref_prelu(self.x_np, weight_np)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+        paddle.enable_static()
+
+    def test_static_api(self):
+        self.static_check(self.weight_np_0)
+        self.static_check(self.weight_np_1)
+
+    def test_dygraph_api(self):
+        self.dygraph_check(self.weight_np_0)
+        self.dygraph_check(self.weight_np_1)
+
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            weight_fp32 = paddle.data(
+                name='weight_fp32', shape=[1], dtype='float32')
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.prelu, 0.1, 'all')
+            self.assertRaises(TypeError, F.prelu, x=1, weight=weight_fp32)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.prelu, x_int32, 'all')
-            # support the input dtype is float32
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float32')
-            fluid.layers.prelu(x_fp16, 'all')
+            x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32')
+            self.assertRaises(TypeError, F.prelu, x=x_int32, weight=weight_fp32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16')
+            F.prelu(x=x_fp16, weight=weight_fp32)
+
+
+class TestNNPReluAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
+
+    def test_static_api(self):
+        startup_program = paddle.static.Program()
+        train_program = paddle.static.Program()
+        with paddle.static.program_guard(train_program, startup_program):
+            x = paddle.data(name='X', shape=self.x_np.shape, dtype='float32')
+            m = paddle.nn.PReLU()
+            out = m(x)
+            exe = paddle.static.Executor(self.place)
+            exe.run(startup_program)
+            res = exe.run(train_program,
+                          feed={'X': self.x_np},
+                          fetch_list=[out])
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU()
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(num_parameters=self.x_np.shape[1])
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, self.x_np.shape[1], 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(init=0.5)
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.5)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr(name="weight"))
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(0.5)))
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.5)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        paddle.enable_static()
 
 
 class PReluTest(OpTest):
@@ -51,21 +162,22 @@ class PReluTest(OpTest):
         if self.attrs == {'mode': "all"}:
             alpha_np = np.random.uniform(-1, -0.5, (1))
         elif self.attrs == {'mode': "channel"}:
-            alpha_np = np.random.uniform(-1, -0.5, [1, self.x_shape[1]])
+            alpha_np = np.random.uniform(-1, -0.5, [1, self.x_shape[1], 1, 1])
         else:
             alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:])
 
         self.inputs = {'X': x_np, 'Alpha': alpha_np}
 
-        # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100] to [1, 100, 1, 1] since np operands could not be broadcast together with shapes (2,100,3,4) (1,100) 
+        # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100, 1, 1] to [1, 100] + [1]*len(x.shape[2:])
+        # since np operands could not be broadcast together with shapes (1,100,2,2,2,3) (1,100,1,1) 	
+        reshaped_alpha = self.inputs['Alpha']
         if self.attrs == {'mode': "channel"}:
-            self.inputs['Alpha'] = np.reshape(
+            reshaped_alpha = np.reshape(
                 self.inputs['Alpha'],
                 [1, self.x_shape[1]] + [1] * len(self.x_shape[2:]))
 
         out_np = np.maximum(self.inputs['X'], 0.)
-        out_np = out_np + np.minimum(self.inputs['X'],
-                                     0.) * self.inputs['Alpha']
+        out_np = out_np + np.minimum(self.inputs['X'], 0.) * reshaped_alpha
         assert out_np is not self.inputs['X']
         self.outputs = {'Out': out_np}
 
diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..158683907253e2ebc5adab6799c75ffd914df1c7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_prod_op.py
@@ -0,0 +1,132 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import unittest
+import numpy as np
+
+
+class TestProdOp(unittest.TestCase):
+    def setUp(self):
+        self.input = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_tensor(self.input)
+        dy_result = paddle.prod(input)
+        expected_result = np.prod(self.input)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1)
+        expected_result = np.prod(self.input, axis=1)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=-1)
+        expected_result = np.prod(self.input, axis=-1)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=[0, 1])
+        expected_result = np.prod(self.input, axis=(0, 1))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, keepdim=True)
+        expected_result = np.prod(self.input, axis=1, keepdims=True)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, dtype='int64')
+        expected_result = np.prod(self.input, axis=1, dtype=np.int64)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, keepdim=True, dtype='int64')
+        expected_result = np.prod(
+            self.input, axis=1, keepdims=True, dtype=np.int64)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        result0 = paddle.prod(input)
+        result1 = paddle.prod(input, axis=1)
+        result2 = paddle.prod(input, axis=-1)
+        result3 = paddle.prod(input, axis=[0, 1])
+        result4 = paddle.prod(input, axis=1, keepdim=True)
+        result5 = paddle.prod(input, axis=1, dtype='int64')
+        result6 = paddle.prod(input, axis=1, keepdim=True, dtype='int64')
+
+        place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(paddle.static.default_startup_program())
+        static_result = exe.run(feed={"input": self.input},
+                                fetch_list=[
+                                    result0, result1, result2, result3, result4,
+                                    result5, result6
+                                ])
+
+        expected_result = np.prod(self.input)
+        self.assertTrue(np.allclose(static_result[0], expected_result))
+        expected_result = np.prod(self.input, axis=1)
+        self.assertTrue(np.allclose(static_result[1], expected_result))
+        expected_result = np.prod(self.input, axis=-1)
+        self.assertTrue(np.allclose(static_result[2], expected_result))
+        expected_result = np.prod(self.input, axis=(0, 1))
+        self.assertTrue(np.allclose(static_result[3], expected_result))
+        expected_result = np.prod(self.input, axis=1, keepdims=True)
+        self.assertTrue(np.allclose(static_result[4], expected_result))
+        expected_result = np.prod(self.input, axis=1, dtype=np.int64)
+        self.assertTrue(np.allclose(static_result[5], expected_result))
+        expected_result = np.prod(
+            self.input, axis=1, keepdims=True, dtype=np.int64)
+        self.assertTrue(np.allclose(static_result[6], expected_result))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not paddle.fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.run_static(use_gpu=True)
+
+
+class TestProdOpError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data(name='x', shape=[2, 2, 4], dtype='float32')
+            bool_x = paddle.data(name='bool_x', shape=[2, 2, 4], dtype='bool')
+            # The argument x shoule be a Tensor
+            self.assertRaises(TypeError, paddle.prod, [1])
+
+            # The data type of x should be float32, float64, int32, int64
+            self.assertRaises(TypeError, paddle.prod, bool_x)
+
+            # The argument axis's type shoule be int ,list or tuple
+            self.assertRaises(TypeError, paddle.prod, x, 1.5)
+
+            # The argument dtype of prod_op should be float32, float64, int32 or int64.
+            self.assertRaises(TypeError, paddle.prod, x, 'bool')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/dygraph/backward_strategy.py b/python/paddle/fluid/tests/unittests/test_query_op.py
similarity index 53%
rename from python/paddle/fluid/dygraph/backward_strategy.py
rename to python/paddle/fluid/tests/unittests/test_query_op.py
index bfcf66af31ce13b3394b5b091882b1976f9f003a..fc8ce5ad5f6b89b28fb2ddddd15d5b315fe4c0e4 100644
--- a/python/paddle/fluid/dygraph/backward_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_query_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,8 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
+import unittest
+import paddle
 from paddle.fluid import core
 
-__all__ = ["BackwardStrategy"]
 
-BackwardStrategy = core.BackwardStrategy
+class TestCudnnVersion(unittest.TestCase):
+    def test_no_cudnn(self):
+        cudnn_version = paddle.get_cudnn_version()
+        if not core.is_compiled_with_cuda():
+            self.assertEqual((cudnn_version is None), True)
+        else:
+            self.assertEqual((isinstance(cudnn_version, int)), True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py
index 5b2d5be346a9b205cb44373f58a413baa6c8a2fa..88b07f5df83f8f967f8ba76e78b37ecfb2c54276 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
@@ -19,7 +19,7 @@ import numpy as np
 from op_test import OpTest
 import paddle
 from paddle.fluid import core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 def output_hist(out):
@@ -125,14 +125,14 @@ class TestRandintAPI(unittest.TestCase):
             out4 = paddle.randint(
                 low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32')
             # shape is a tensor and dtype is 'float64'
-            var_shape = paddle.nn.data(
+            var_shape = paddle.static.data(
                 name='var_shape', shape=[2], dtype="int64")
             out5 = paddle.randint(
                 low=1, high=1000, shape=var_shape, dtype='int64')
 
             place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             outs = exe.run(
                 feed={'var_shape': np.array([100, 100]).astype('int64')},
                 fetch_list=[out1, out2, out3, out4, out5])
@@ -141,13 +141,14 @@ class TestRandintAPI(unittest.TestCase):
 class TestRandintImperative(unittest.TestCase):
     def test_api(self):
         n = 10
-        with paddle.imperative.guard():
-            x1 = paddle.randint(n, shape=[10], dtype="int32")
-            x2 = paddle.tensor.randint(n)
-            x3 = paddle.tensor.random.randint(n)
-            for i in [x1, x2, x3]:
-                for j in i.numpy().tolist():
-                    self.assertTrue((j >= 0 and j < n))
+        paddle.disable_static()
+        x1 = paddle.randint(n, shape=[10], dtype="int32")
+        x2 = paddle.tensor.randint(n)
+        x3 = paddle.tensor.random.randint(n)
+        for i in [x1, x2, x3]:
+            for j in i.numpy().tolist():
+                self.assertTrue((j >= 0 and j < n))
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py
index f65cc6dc53b7e3541016447d8510bd3d38a53b17..9d2c03f3bba914d8f6b06b54ce0e19c168edb9e3 100644
--- a/python/paddle/fluid/tests/unittests/test_randn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randn_op.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 class TestRandnOp(unittest.TestCase):
@@ -34,12 +34,12 @@ class TestRandnOp(unittest.TestCase):
             dim_2 = paddle.fill_constant([1], "int32", 50)
             x3 = paddle.randn([dim_1, dim_2, 784])
 
-            var_shape = paddle.nn.data('X', [2], 'int32')
+            var_shape = paddle.static.data('X', [2], 'int32')
             x4 = paddle.randn(var_shape)
 
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        exe = paddle.Executor(place)
+        exe = paddle.static.Executor(place)
         res = exe.run(train_program,
                       feed={'X': np.array(
                           shape, dtype='int32')},
@@ -55,20 +55,21 @@ class TestRandnOpForDygraph(unittest.TestCase):
         shape = [1000, 784]
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        with paddle.imperative.guard(place):
-            x1 = paddle.randn(shape, 'float32')
-            x2 = paddle.randn(shape, 'float64')
+        paddle.disable_static(place)
+        x1 = paddle.randn(shape, 'float32')
+        x2 = paddle.randn(shape, 'float64')
 
-            dim_1 = paddle.fill_constant([1], "int64", 20)
-            dim_2 = paddle.fill_constant([1], "int32", 50)
-            x3 = paddle.randn(shape=[dim_1, dim_2, 784])
+        dim_1 = paddle.fill_constant([1], "int64", 20)
+        dim_2 = paddle.fill_constant([1], "int32", 50)
+        x3 = paddle.randn(shape=[dim_1, dim_2, 784])
 
-            var_shape = paddle.imperative.to_variable(np.array(shape))
-            x4 = paddle.randn(var_shape)
+        var_shape = paddle.to_variable(np.array(shape))
+        x4 = paddle.randn(var_shape)
 
-            for out in [x1, x2, x3, x4]:
-                self.assertAlmostEqual(np.mean(out.numpy()), .0, delta=0.1)
-                self.assertAlmostEqual(np.std(out.numpy()), 1., delta=0.1)
+        for out in [x1, x2, x3, x4]:
+            self.assertAlmostEqual(np.mean(out.numpy()), .0, delta=0.1)
+            self.assertAlmostEqual(np.std(out.numpy()), 1., delta=0.1)
+        paddle.enable_static()
 
 
 class TestRandnOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py
new file mode 100644
index 0000000000000000000000000000000000000000..2933abe46c1b87959c9f61975c02a41e91dfbef3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_random_seed.py
@@ -0,0 +1,463 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.fluid.generator as generator
+
+import time  # temp for debug
+import paddle.fluid as fluid
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+
+
+class TestGeneratorSeed(unittest.TestCase):
+    """
+    Test cases for cpu generator seed.
+    """
+
+    def test_generator_uniform_random_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0)
+        st1 = gen.get_state()
+        x1 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        gen.set_state(st1)
+        x2 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_uniform_random_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = fluid.layers.uniform_random(shape=[3, 4])
+            result_2 = fluid.layers.uniform_random(shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_gen_dropout_dygraph(self):
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(111111111)
+        st = gen.get_state()
+        # x = np.arange(1,101).reshape(2,50).astype("float32")
+        x = fluid.layers.uniform_random(
+            [2, 10], dtype="float32", min=0.0, max=1.0)
+        y = fluid.layers.dropout(x, 0.5)
+        gen.manual_seed(111111111)
+        #gen.set_state(st)
+        x1 = fluid.layers.uniform_random(
+            [2, 10], dtype="float32", min=0.0, max=1.0)
+        y1 = fluid.layers.dropout(x1, 0.5)
+        y_np = y.numpy()
+        y1_np = y1.numpy()
+        #print(y_np)
+        #print(y1_np)
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> dropout dygraph >>>>>>>")
+            self.assertTrue(np.allclose(y_np, y1_np))
+
+    def test_gen_dropout_static(self):
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x_1 = fluid.layers.uniform_random(shape=[2, 10])
+            y_1 = fluid.layers.dropout(x_1, 0.5)
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program, feed={}, fetch_list=[y_1])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program, feed={}, fetch_list=[y_1])
+        out1_np = np.array(out1[0])
+        out2_np = np.array(out2[0])
+        # print(out1_np)
+        # print(out2_np)
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> dropout static >>>>>>>")
+            self.assertTrue(np.allclose(out1_np, out2_np))
+
+    def test_generator_gaussian_random_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.gaussian_random([10], dtype="float32")
+        st1 = gen.get_state()
+        x1 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.set_state(st1)
+        x2 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.gaussian_random([10], dtype="float32")
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> gaussian random dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_gaussian_random_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = fluid.layers.gaussian_random(shape=[3, 4])
+            result_2 = fluid.layers.gaussian_random(shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> gaussian random static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_randint_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = paddle.randint(low=10, shape=[10], dtype="int32")
+        st1 = gen.get_state()
+        x1 = paddle.randint(low=10, shape=[10], dtype="int32")
+        gen.set_state(st1)
+        x2 = paddle.randint(low=10, shape=[10], dtype="int32")
+        gen.manual_seed(12312321111)
+        x3 = paddle.randint(low=10, shape=[10], dtype="int32")
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> randint dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_ranint_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = paddle.randint(low=10, shape=[3, 4])
+            result_2 = paddle.randint(low=10, shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> randint static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_randperm_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = paddle.randperm(10)
+        st1 = gen.get_state()
+        x1 = paddle.randperm(10)
+        gen.set_state(st1)
+        x2 = paddle.randperm(10)
+        gen.manual_seed(12312321111)
+        x3 = paddle.randperm(10)
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        # print("## {}".format(x1_np))
+        # print("## {}".format(x2_np))
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> randperm dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_randperm_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = paddle.randperm(10)
+            result_2 = paddle.randperm(10)
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> randperm static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_sampling_id_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y = fluid.layers.sampling_id(x)
+        st1 = gen.get_state()
+        x1 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y1 = fluid.layers.sampling_id(x)
+        gen.set_state(st1)
+        x2 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y2 = fluid.layers.sampling_id(x)
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y3 = fluid.layers.sampling_id(x)
+
+        x_np = y.numpy()
+        x1_np = y1.numpy()
+        x2_np = y2.numpy()
+        x3_np = y3.numpy()
+
+        print("## {}".format(x1_np))
+        print("## {}".format(x2_np))
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> sampling id dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_randperm_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x = fluid.layers.uniform_random(shape=[10, 10])
+            result_1 = fluid.layers.sampling_id(x)
+            result_2 = fluid.layers.sampling_id(x)
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> sampling id static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_gen_TruncatedNormal_initializer(self):
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+        cur_state = gen.get_state()
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x = fluid.layers.uniform_random(shape=[2, 10])
+            result_1 = fluid.layers.fc(
+                input=x,
+                size=10,
+                param_attr=fluid.initializer.TruncatedNormal(
+                    loc=0.0, scale=2.0))
+            result_2 = fluid.layers.fc(
+                input=x,
+                size=10,
+                param_attr=fluid.initializer.TruncatedNormal(
+                    loc=0.0, scale=2.0))
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+
+        #gen.set_state(cur_state)    
+        gen.manual_seed(123123143)
+        with fluid.program_guard(train_program, startup_program):
+            exe.run(startup_program)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+        out1_res1 = np.array(out1[0])
+        out1_res2 = np.array(out1[1])
+        out2_res1 = np.array(out2[0])
+        out2_res2 = np.array(out2[1])
+
+        print(out1_res1)
+        print(out1_res2)
+        print(out2_res1)
+        print(out2_res2)
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> sampling id static >>>>>>>")
+            self.assertTrue(np.allclose(out1_res1, out2_res1))
+            self.assertTrue(np.allclose(out1_res2, out2_res2))
+            self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index 6938b8ef1e051777c867796062e5e7cbed6d7fa4..4361a45f1568f5f047ee03090bd3ef28a8d6654f 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -17,7 +17,7 @@ import numpy as np
 from op_test import OpTest
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 def check_randperm_out(n, data_np):
@@ -108,7 +108,7 @@ class TestRandpermAPI(unittest.TestCase):
             x1 = paddle.randperm(n)
             x2 = paddle.randperm(n, 'float32')
 
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             res = exe.run(fetch_list=[x1, x2])
 
             self.assertEqual(res[0].dtype, np.int64)
@@ -119,13 +119,14 @@ class TestRandpermAPI(unittest.TestCase):
 
 class TestRandpermImperative(unittest.TestCase):
     def test_out(self):
-        with paddle.imperative.guard():
-            n = 10
-            for dtype in ['int32', np.int64, 'float32', 'float64']:
-                data_p = paddle.randperm(n, dtype)
-                data_np = data_p.numpy()
-                self.assertTrue(
-                    check_randperm_out(n, data_np), msg=error_msg(data_np))
+        paddle.disable_static()
+        n = 10
+        for dtype in ['int32', np.int64, 'float32', 'float64']:
+            data_p = paddle.randperm(n, dtype)
+            data_np = data_p.numpy()
+            self.assertTrue(
+                check_randperm_out(n, data_np), msg=error_msg(data_np))
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 16874d80112bbd537718b20fd4dec3701ea3b75d..cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -67,22 +67,6 @@ class TestSumOp6D(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestMeanOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -318,21 +302,6 @@ class TestReduceAll(Test1DReduce):
         self.outputs = {'Out': self.inputs['X'].sum()}
 
 
-## reduction in multi dims
-class TestReduceMeanOpMultiAxises(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1, 2]}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -420,40 +389,6 @@ class TestReduceSumWithNumelOne(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestReduceMeanWithDimOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((100, 1, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': False}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=False)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceMeanWithNumelOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((100, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=True)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 class TestReduceAll(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -536,18 +471,6 @@ class TestReduceSumOpError(unittest.TestCase):
             self.assertRaises(TypeError, fluid.layers.reduce_sum, x2)
 
 
-class TestReduceMeanOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of reduce_mean_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.reduce_mean, x1)
-            # The input dtype of reduce_mean_op  must be float32 or float64 or int32 or int64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
-            self.assertRaises(TypeError, fluid.layers.reduce_mean, x2)
-
-
 class API_TestSumOpError(unittest.TestCase):
     def test_errors(self):
         def test_dtype1():
@@ -580,10 +503,10 @@ class API_TestSumOpError(unittest.TestCase):
 
 
 class API_TestSumOp(unittest.TestCase):
-    def test_1(self):
+    def test_static(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_sum = paddle.sum(input=data, dim=1, dtype="float64")
+            result_sum = paddle.sum(x=data, axis=1, dtype="float64")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.rand(10, 10).astype(np.float32)
@@ -593,7 +516,7 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1, dtype="int64")
+            result_sum = paddle.sum(x=data, axis=1, dtype="int64")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
@@ -603,7 +526,7 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1)
+            result_sum = paddle.sum(x=data, axis=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
@@ -612,84 +535,41 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1)
+            result_sum = paddle.sum(x=data, axis=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
             res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum])
         self.assertEqual((res == np.sum(input_data, axis=1)).all(), True)
 
-        with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            z = paddle.sum(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.sum(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
-
-
-class API_TestMaxOp(unittest.TestCase):
-    def test_1(self):
-        # type: float
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_max = paddle.max(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.rand(10, 10).astype(np.float32)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
-        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
-
-        # type: int
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="int64")
-            result_max = paddle.max(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
-        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
-
-        # dygraph
-        with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            z = paddle.max(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.max(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
-
+            input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32)
+            data = fluid.data("data", shape=[5, 5, 5], dtype="int32")
+            sum1 = paddle.sum(x=data, axis=[0, 1])
+            sum2 = paddle.sum(x=data, axis=())
 
-class API_TestMinOp(unittest.TestCase):
-    def test_1(self):
-        # type: float
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_min = paddle.min(input=data, dim=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
-            input_data = np.random.rand(10, 10).astype(np.float32)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
-        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+            res1, res2 = exe.run(feed={"data": input_data},
+                                 fetch_list=[sum1, sum2])
 
-        # type: int
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="int64")
-            result_min = paddle.min(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
-        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+        self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True)
+        self.assertEqual(
+            (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True)
 
-        # dygraph
+    def test_dygraph(self):
+        np_x = np.random.random([2, 3, 4]).astype('int32')
         with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
             x = fluid.dygraph.to_variable(np_x)
-            z = paddle.min(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.min(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
+            out0 = paddle.sum(x).numpy()
+            out1 = paddle.sum(x, axis=0).numpy()
+            out2 = paddle.sum(x, axis=(0, 1)).numpy()
+            out3 = paddle.sum(x, axis=(0, 1, 2)).numpy()
+
+        self.assertTrue((out0 == np.sum(np_x, axis=(0, 1, 2))).all())
+        self.assertTrue((out1 == np.sum(np_x, axis=0)).all())
+        self.assertTrue((out2 == np.sum(np_x, axis=(0, 1))).all())
+        self.assertTrue((out3 == np.sum(np_x, axis=(0, 1, 2))).all())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 3dfd9023f5af30ff289c4dc55a0c275402bc3067..275f9d21f9f8eca653a030bfe5c74071397f33c1 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 
@@ -227,35 +228,43 @@ class TestReshapeUint8Op(TestReshapeInt8Op):
 
 # Test python API
 class TestReshapeAPI(unittest.TestCase):
-    # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-    def test_1(self):
+    def _set_paddle_api(self):
+        self.fill_constant = paddle.fill_constant
+        self.data = paddle.data
+        self.reshape = paddle.reshape
+        self.to_tensor = paddle.to_tensor
+
+    def _set_fluid_api(self):
+        self.fill_constant = fluid.layers.fill_constant
+        self.data = fluid.data
+        self.reshape = fluid.layers.reshape
+
+    def _test_api(self):
         input = np.random.random([2, 25]).astype("float32")
         shape = [2, 5, 5]
-        positive_five = fluid.layers.fill_constant([1], "int32", 5)
-        x = fluid.layers.data(
-            name="x", shape=[2, 25], append_batch_size=False, dtype="float32")
+        main_prog = Program()
+        with program_guard(main_prog, Program()):
+            positive_five = self.fill_constant([1], "int32", 5)
+            x = self.data(name="x", shape=[2, 25], dtype="float32")
 
-        actual_shape = fluid.layers.data(
-            name="shape",
-            shape=[1, 3],
-            append_batch_size=False,
-            dtype="float32")
+            actual_shape = self.data(name="shape", shape=[3], dtype="int32")
 
-        # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-        out_1 = fluid.layers.reshape(x, shape)
+            # situation 1: have shape( list, no tensor), no actual shape(Tensor)
+            out_1 = self.reshape(x, shape)
 
-        # situation 2: have shape(list, no tensor), have actual shape(Tensor)
-        out_2 = fluid.layers.reshape(x, shape=shape, actual_shape=actual_shape)
+            # situation 2: have shape(list, no tensor), have actual shape(Tensor)
+            out_2 = fluid.layers.reshape(
+                x, shape=shape, actual_shape=actual_shape)
 
-        # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
-        out_3 = fluid.layers.reshape(x, shape=[positive_five, 10])
+            # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
+            out_3 = self.reshape(x, shape=[positive_five, 10])
 
-        # Situation 4: have shape(Tensor), no actual shape(Tensor)
-        out_4 = fluid.layers.reshape(x, shape=actual_shape)
+            # Situation 4: have shape(Tensor), no actual shape(Tensor)
+            out_4 = self.reshape(x, shape=actual_shape)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4 = exe.run(
-            fluid.default_main_program(),
+            main_prog,
             feed={"x": input,
                   "shape": np.array([2, 5, 5]).astype("int32")},
             fetch_list=[out_1, out_2, out_3, out_4])
@@ -265,76 +274,108 @@ class TestReshapeAPI(unittest.TestCase):
         assert np.array_equal(res_3, input.reshape([5, 10]))
         assert np.array_equal(res_4, input.reshape(shape))
 
+    def test_paddle_api(self):
+        self._set_paddle_api()
+        self._test_api()
+
+    def test_fluid_api(self):
+        self._set_fluid_api()
+        self._test_api()
+
+    def test_imperative(self):
+        self._set_paddle_api()
+        input = np.random.random([2, 25]).astype("float32")
+        shape = [2, 5, 5]
+        with fluid.dygraph.guard():
+            x = self.to_tensor(input)
+            positive_five = self.fill_constant([1], "int32", 5)
+
+            out_1 = self.reshape(x, shape)
+
+            out_2 = self.reshape(x, shape=[positive_five, 10])
+
+            shape_tensor = self.to_tensor(np.array([2, 5, 5]).astype("int32"))
+            out_3 = self.reshape(x, shape=shape_tensor)
+
+        assert np.array_equal(out_1.numpy(), input.reshape(shape))
+        assert np.array_equal(out_2.numpy(), input.reshape([5, 10]))
+        assert np.array_equal(out_3.numpy(), input.reshape(shape))
+
 
 # Test Input Error
 class TestReshapeOpError(unittest.TestCase):
-    def test_errors(self):
+    def _set_paddle_api(self):
+        self.data = paddle.data
+        self.reshape = paddle.reshape
+
+    def _set_fluid_api(self):
+        self.data = fluid.data
+        self.reshape = fluid.layers.reshape
+
+    def _test_errors(self):
         with program_guard(Program(), Program()):
             # The x type of reshape_op must be Variable.
             def test_x_type():
                 x1 = fluid.create_lod_tensor(
                     np.array([[-1]]), [[1]], fluid.CPUPlace())
-                fluid.layers.reshape(x1, shape=[1])
+                self.reshape(x1, shape=[1])
 
             self.assertRaises(TypeError, test_x_type)
 
             # The x dtype of reshape_op must be float16, float32, float64, int32 or int64.
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name="x2",
-                    shape=[2, 25],
-                    append_batch_size=False,
-                    dtype="bool")
-                fluid.layers.reshape(x2, shape=[2, 5, 5])
+                x2 = self.data(name="x2", shape=[2, 25], dtype="bool")
+                self.reshape(x2, shape=[2, 5, 5])
 
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_x_dtype_float16():
-                x_float16 = fluid.layers.data(
-                    name="x_float16",
-                    shape=[2, 25],
-                    append_batch_size=False,
-                    dtype="float16")
-                fluid.layers.reshape(x_float16, shape=[2, 5, 5])
+                x_float16 = self.data(
+                    name="x_float16", shape=[2, 25], dtype="float16")
+                self.reshape(x_float16, shape=[2, 5, 5])
 
             test_x_dtype_float16()
 
-            x3 = fluid.layers.data(
-                name="x3",
-                shape=[2, 25],
-                append_batch_size=False,
-                dtype="float32")
+            x3 = self.data(name="x3", shape=[2, 25], dtype="float32")
 
             # The argument shape's type of reshape_op must be list, tuple or Variable.
             def test_shape_type():
-                fluid.layers.reshape(x3, shape=1)
+                self.reshape(x3, shape=1)
 
             self.assertRaises(TypeError, test_shape_type)
 
             # The argument actual_shape's type of reshape_op must be Variable or None.
             def test_actual_shape_type():
-                fluid.layers.reshape(x3, shape=[25, 2], actual_shape=1)
+                self.reshape(x3, shape=[25, 2], actual_shape=1)
 
             self.assertRaises(TypeError, test_actual_shape_type)
 
             # The argument shape have more than one -1.
             def test_shape_1():
-                fluid.layers.reshape(x3, shape=[-1, -1, 5])
+                self.reshape(x3, shape=[-1, -1, 5])
 
             self.assertRaises(AssertionError, test_shape_1)
 
             # The argument shape have element 0 whose index exceed the input dimension.
             def test_shape_2():
-                fluid.layers.reshape(x3, [2, 5, 5, 0])
+                self.reshape(x3, [2, 5, 5, 0])
 
             self.assertRaises(AssertionError, test_shape_2)
 
             # The argument shape have more than one negative value.
             def test_shape_3():
-                fluid.layers.reshape(x3, [-1, -2, 5])
+                self.reshape(x3, [-1, -2, 5])
 
             self.assertRaises(AssertionError, test_shape_3)
 
+    def test_paddle_api_error(self):
+        self._set_paddle_api()
+        self._test_errors()
+
+    def test_fluid_api_error(self):
+        self._set_fluid_api()
+        self._test_errors()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py
index bc50cf197f63e6082ea1d3fdbff1891f500e5b9a..9abbee173852baf9db998aad3b71edabdb3e11ed 100644
--- a/python/paddle/fluid/tests/unittests/test_retain_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py
@@ -17,16 +17,16 @@ import paddle
 import paddle.fluid as fluid
 import unittest
 
-paddle.enable_imperative()
+paddle.disable_static()
 SEED = 2020
 np.random.seed(SEED)
-fluid.default_main_program().random_seed = SEED
+paddle.manual_seed(SEED)
 
 
 class Generator(fluid.dygraph.Layer):
     def __init__(self):
         super(Generator, self).__init__()
-        self.conv1 = paddle.nn.Conv2D(3, 3, 3, 1)
+        self.conv1 = paddle.nn.Conv2d(3, 3, 3, padding=1)
 
     def forward(self, x):
         x = self.conv1(x)
@@ -37,7 +37,7 @@ class Generator(fluid.dygraph.Layer):
 class Discriminator(fluid.dygraph.Layer):
     def __init__(self):
         super(Discriminator, self).__init__()
-        self.convd = paddle.nn.Conv2D(6, 3, 1)
+        self.convd = paddle.nn.Conv2d(6, 3, 1)
 
     def forward(self, x):
         x = self.convd(x)
@@ -60,8 +60,10 @@ class TestRetainGraph(unittest.TestCase):
                 interpolatesv = fake_data
             elif type == 'mixed':
                 alpha = paddle.rand((real_data.shape[0], 1))
-                alpha = paddle.expand(
-                    alpha, [1, np.prod(real_data.shape) // real_data.shape[0]])
+                alpha = paddle.expand(alpha, [
+                    real_data.shape[0],
+                    np.prod(real_data.shape) // real_data.shape[0]
+                ])
                 alpha = paddle.reshape(alpha, real_data.shape)
                 interpolatesv = alpha * real_data + ((1 - alpha) * fake_data)
             else:
@@ -73,7 +75,7 @@ class TestRetainGraph(unittest.TestCase):
 
             outs = paddle.fill_constant(disc_interpolates.shape,
                                         disc_interpolates.dtype, 1.0)
-            gradients = paddle.imperative.grad(
+            gradients = paddle.grad(
                 outputs=disc_interpolates,
                 inputs=fake_AB,
                 grad_outputs=outs,
@@ -90,12 +92,12 @@ class TestRetainGraph(unittest.TestCase):
         else:
             return 0.0, None
 
-    def test_retain(self):
+    def run_retain(self, need_retain):
         g = Generator()
         d = Discriminator()
 
-        optim_g = paddle.optimizer.Adam(parameter_list=g.parameters())
-        optim_d = paddle.optimizer.Adam(parameter_list=d.parameters())
+        optim_g = paddle.optimizer.Adam(parameters=g.parameters())
+        optim_d = paddle.optimizer.Adam(parameters=d.parameters())
 
         gan_criterion = paddle.nn.MSELoss()
         l1_criterion = paddle.nn.L1Loss()
@@ -103,8 +105,8 @@ class TestRetainGraph(unittest.TestCase):
         A = np.random.rand(2, 3, 32, 32).astype('float32')
         B = np.random.rand(2, 3, 32, 32).astype('float32')
 
-        realA = paddle.imperative.to_variable(A)
-        realB = paddle.imperative.to_variable(B)
+        realA = paddle.to_variable(A)
+        realB = paddle.to_variable(B)
         fakeB = g(realA)
 
         optim_d.clear_gradients()
@@ -117,7 +119,7 @@ class TestRetainGraph(unittest.TestCase):
             d, realA, fakeB, lambda_gp=10.0)
         loss_d = gan_criterion(G_pred_fake, false_target) + G_gradient_penalty
 
-        loss_d.backward(retain_graph=True)
+        loss_d.backward(retain_graph=need_retain)
         optim_d.minimize(loss_d)
 
         optim_g.clear_gradients()
@@ -130,6 +132,11 @@ class TestRetainGraph(unittest.TestCase):
         loss_g.backward()
         optim_g.minimize(loss_g)
 
+    def test_retain(self):
+        self.run_retain(need_retain=True)
+        self.assertRaises(
+            fluid.core.EnforceNotMet, self.run_retain, need_retain=False)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index eb12bc741767340a3e7e3580a8b95065d4267693..f7b9d4214d36a422a3ec94dc410e58c6c827ef4c 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
+import paddle
 
 
 def create_selected_rows_and_tensor(scope, place, height, row_num,
@@ -222,5 +223,59 @@ class TestRmspropOp(TestBase):
                         size=size)
 
 
+class TestRMSPropV2(unittest.TestCase):
+    def test_rmsprop_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_tensor(value)
+        linear = paddle.nn.Linear(13, 5)
+        # This can be any optimizer supported by dygraph.
+        adam = paddle.optimizer.RMSProp(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_rmsprop(self):
+        place = fluid.CPUPlace()
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
+            rms_optimizer.minimize(avg_cost)
+
+            fetch_list = [avg_cost]
+            train_reader = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for data in train_reader():
+                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+
+    def test_raise_error(self):
+        self.assertRaises(ValueError, paddle.optimizer.RMSProp, None)
+        self.assertRaises(
+            ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None)
+        self.assertRaises(
+            ValueError,
+            paddle.optimizer.RMSProp,
+            learning_rate=0.1,
+            epsilon=None)
+        self.assertRaises(
+            ValueError,
+            paddle.optimizer.RMSProp,
+            learning_rate=0.1,
+            momentum=None)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d..7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -248,7 +248,8 @@ class PolicyGradient(object):
             func=reward_func, x=[action, length], out=reward)
         neg_log_prob = layers.cross_entropy(act_prob, action)
         cost = neg_log_prob * reward
-        cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
+        cost = (layers.reduce_sum(cost) /
+                layers.cast(layers.reduce_sum(length), "float32")
                 ) if length is not None else layers.reduce_mean(cost)
         optimizer = fluid.optimizer.Adam(self.lr)
         optimizer.minimize(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index 5e9c67c1a7a29b69a977cc94487fc3d26f24eeb8..ce3b060828ac475a10d61bf756423069ab0a70c1 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 from op_test import OpTest
 import paddle.fluid.core as core
 
@@ -173,5 +175,55 @@ class TestScatterOp5(OpTest):
             self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True)
 
 
+class TestScatterAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[3, 2], dtype="float64")
+            index = fluid.data(name="index", shape=[4], dtype="int64")
+            updates = fluid.data(name="updates", shape=[4, 2], dtype="float64")
+            result = paddle.scatter(input, index, updates, False)
+
+            input_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
+            index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+            updates_data = np.array(
+                [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input": input_data,
+                                  "index": index_data,
+                                  "updates": updates_data
+                              },
+                              fetch_list=[result])
+            self.assertEqual((fetches[0] == \
+                              np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
+                index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+                updates_data = np.array(
+                    [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+
+                x = fluid.dygraph.to_variable(x_data)
+                index = fluid.dygraph.to_variable(index_data)
+                updates = fluid.dygraph.to_variable(updates_data)
+
+                output1 = paddle.scatter(x, index, updates, overwrite=False)
+                self.assertEqual((output1.numpy() == \
+                                  np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
index 6070c84ff236274cc1778d0dce9ab40d884ce7ec..b5a2e84a53ef621f3be81b90d02c10d28fe18162 100644
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
@@ -17,9 +17,26 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import six
+import paddle.fluid.core as core
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.fluid import compiler, Program, program_guard
+
+
+def ref_selu(x,
+             scale=1.0507009873554804934193349852946,
+             alpha=1.6732632423543772848170429916717):
+    out = np.copy(x)
+    out_flat = out.flatten()
+    for i in range(out_flat.size):
+        if out_flat[i] < 0:
+            out_flat[i] = alpha * np.exp(out_flat[i]) - alpha
+        out_flat[i] = scale * out_flat[i]
+    out = out_flat.reshape(x.shape)
+    return out
 
 
 class SeluTest(OpTest):
@@ -39,17 +56,10 @@ class SeluTest(OpTest):
         # zero.
         x[np.abs(x) < 0.005] = 0.02
 
-        x_flat = x.flatten()
-
-        for i in range(x_flat.size):
-            if x_flat[i] < 0:
-                x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
-            x_flat[i] = scale * x_flat[i]
-
-        out_np = x_flat.reshape(self.x_shape)
+        out = ref_selu(x, scale, alpha)
 
         self.inputs = {'X': x}
-        self.outputs = {'Out': out_np}
+        self.outputs = {'Out': out}
 
         self.attrs = {
             'alpha': alpha,
@@ -69,17 +79,65 @@ class SeluTest(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestSeluOpError(unittest.TestCase):
+class TestSeluAPI(unittest.TestCase):
+    # test paddle.nn.SELU, paddle.nn.functional.selu
+    def setUp(self):
+        self.scale = 1.5
+        self.alpha = 2.0
+        self.x_np = np.random.normal(size=[3, 5, 5, 10]).astype(np.float64)
+        # Since zero point in selu is not differentiable, avoid randomize
+        # zero.
+        self.x_np[np.abs(self.x_np) < 0.005] = 0.02
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.selu(x, self.scale, self.alpha)
+            selu = paddle.nn.SELU(self.scale, self.alpha)
+            out2 = selu(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.selu(x, self.scale, self.alpha)
+        selu = paddle.nn.SELU(self.scale, self.alpha)
+        out2 = selu(x)
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.selu(x, self.scale, self.alpha)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.selu, 1)
+            self.assertRaises(TypeError, F.selu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.selu, x_int32)
-            # support the input dtype is float32
-            x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32')
-            fluid.layers.selu(x_fp32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.selu, x_int32)
+            # The scale must be greater than 1.0
+            x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32')
+            self.assertRaises(ValueError, F.selu, x_fp32, -1.0)
+            # The alpha must be no less than 0
+            self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.selu(x_fp16)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index b84e3b5377f2796803707dfd68cd5450c512fce7..da5080eabddc93f0c3d08f16e0a7c20b52af47e0 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
@@ -54,5 +55,32 @@ class TestSignOpError(unittest.TestCase):
             fluid.layers.sign(input4)
 
 
+class TestSignAPI(unittest.TestCase):
+    def test_dygraph(self):
+        with fluid.dygraph.guard():
+            np_x = np.array([-1., 0., -0., 1.2, 1.5], dtype='float64')
+            x = paddle.to_tensor(np_x)
+            z = paddle.sign(x)
+            np_z = z.numpy()
+            z_expected = np.sign(np_x)
+            self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_static(self):
+        with program_guard(Program(), Program()):
+            # The input type of sign_op must be Variable or numpy.ndarray.
+            input1 = 12
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input1)
+            # The input dtype of sign_op must be float16, float32, float64.
+            input2 = fluid.layers.data(
+                name='input2', shape=[12, 10], dtype="int32")
+            input3 = fluid.layers.data(
+                name='input3', shape=[12, 10], dtype="int64")
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input2)
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input3)
+            input4 = fluid.layers.data(
+                name='input4', shape=[4], dtype="float16")
+            paddle.sign(input4)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 5ccc667799320d1ef7704b1e4416d9685ec6ecd2..fdcd2d350a6fac115086b5677a972f8b1145ff95 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -663,5 +663,21 @@ class TestImperativeVarBaseGetItem(unittest.TestCase):
         self.assertRaises(Exception, test_float_in_index)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestImperativeCUDAPinnedInput(unittest.TestCase):
+    def test_input_cuda_pinned_var(self):
+        with fluid.dygraph.guard():
+            data = np.random.random((2, 80, 16128)).astype('float32')
+            var = core.VarBase(
+                value=data,
+                name='',
+                persistable=False,
+                place=fluid.CUDAPinnedPlace(),
+                zero_copy=False)
+            sliced = var[:, 10:, :var.shape[1]]
+            self.assertEqual(sliced.shape, [2, 70, 80])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a97f57aaae5f290b20e34242b1b43e5e352223d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def smooth_l1_loss_forward(val, delta):
+    abs_val = abs(val)
+    if abs_val <= delta:
+        return 0.5 * val * val
+    else:
+        return delta * (abs_val - 0.5 * delta)
+
+
+def smooth_l1_loss_np(input, label, reduction='mean', delta=1.0):
+    diff = input - label
+    out = np.vectorize(smooth_l1_loss_forward)(diff, delta)
+    if reduction == 'sum':
+        return np.sum(out)
+    elif reduction == 'mean':
+        return np.mean(out)
+    elif reduction == 'none':
+        return out
+
+
+class SmoothL1Loss(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+
+    def test_smooth_l1_loss_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='mean')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='sum')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none')
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none')
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_delta(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        delta = np.random.rand()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta)
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta)
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, delta=delta)
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 1df50d63e3f67424ed1f42b94c317030ed69c6e9..04d5cc941a4636da0352fe9221cdad8bdfcd2bd9 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -20,6 +20,10 @@ from op_test import OpTest
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+
+np.random.seed(10)
 
 
 def stable_softmax(x):
@@ -31,6 +35,15 @@ def stable_softmax(x):
     return exps / np.sum(exps)
 
 
+def ref_softmax(x, axis=None, dtype=None):
+    x_t = x.copy()
+    if dtype is not None:
+        x_t = x_t.astype(dtype)
+    if axis is None:
+        axis = -1
+    return np.apply_along_axis(stable_softmax, axis, x_t)
+
+
 class TestSoftmaxOp(OpTest):
     def get_x_shape(self):
         return [10, 10]
@@ -89,20 +102,6 @@ class TestSoftmaxOp(OpTest):
                 check_dygraph=(self.use_mkldnn == False))
 
 
-class TestSoftmaxOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.softmax, x1)
-            # The input dtype of softmax_op must be float16, float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.softmax, x2)
-            x3 = fluid.layers.data(name='x3', shape=[4], dtype="float16")
-            fluid.layers.softmax(x3)
-
-
 class TestSoftmaxOp2(TestSoftmaxOp):
     def get_x_shape(self):
         return [2, 3, 4, 5]
@@ -220,5 +219,60 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
         return [2, 3, 4, 5]
 
 
+class TestSoftmaxAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32')
+        self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np)
+
+    def test_static_check(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, 'float32')
+            out1 = F.softmax(x)
+            m = paddle.nn.Softmax()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=None)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_check(self):
+        paddle.disable_static(self.place)
+
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softmax(x)
+        m = paddle.nn.Softmax()
+        out2 = m(x)
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=None)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.softmax(x, axis=0)
+        m = paddle.nn.Softmax(axis=0)
+        out2 = m(x)
+        out_ref = ref_softmax(self.x_np, axis=0, dtype=None)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out = F.softmax(x, dtype=np.float64)
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float64)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        paddle.enable_static()
+
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softmax, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32')
+            self.assertRaises(TypeError, F.softmax, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16')
+            F.softmax(x_fp16)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py
index 087586aa89607a58493c2d4427cbb6d30b31f0da..015b72fd1c5275f758a109451110f61b97c4a0c7 100644
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import unittest
 import paddle
 import paddle.fluid as fluid
-import paddle.imperative as imperative
 import paddle.fluid.layers as layers
 import numpy as np
 import six
@@ -72,16 +71,17 @@ class TestSortDygraph(unittest.TestCase):
             self.place = core.CPUPlace()
 
     def test_api_0(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.sort(var_x)
-            self.assertEqual((np.sort(self.input_data) == out.numpy()).all(),
-                             True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.sort(var_x)
+        self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True)
+        paddle.enable_static()
 
     def test_api_1(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.sort(var_x, axis=-1)
-            self.assertEqual(
-                (np.sort(
-                    self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.sort(var_x, axis=-1)
+        self.assertEqual(
+            (np.sort(
+                self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_std_layer.py b/python/paddle/fluid/tests/unittests/test_std_layer.py
index d1e0056304204bf0dbe47982bbf4b9574acf8eac..e455151481443c1fb918efd9e44444536adc6b7f 100644
--- a/python/paddle/fluid/tests/unittests/test_std_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_std_layer.py
@@ -15,65 +15,104 @@
 import unittest
 import numpy as np
 import paddle
-import paddle.fluid as fluid
 
 
-class TestStdLayer(unittest.TestCase):
+def ref_std(x, axis=None, unbiased=True, keepdim=False):
+    ddof = 1 if unbiased else 0
+    if isinstance(axis, int):
+        axis = (axis, )
+    if axis is not None:
+        axis = tuple(axis)
+    return np.std(x, axis=axis, ddof=ddof, keepdims=keepdim)
+
+
+class TestStdAPI(unittest.TestCase):
     def setUp(self):
-        self._dtype = "float64"
-        self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype)
-
-    def static(self, axis=None, keepdim=False, unbiased=True):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            data = fluid.data(
-                name="data", dtype=self._dtype, shape=[None, 3, 4, 5])
-            out = prog.current_block().create_var(
-                dtype=self._dtype, shape=[2, 3, 4, 5])
-            paddle.std(input=data,
-                       axis=axis,
-                       keepdim=keepdim,
-                       unbiased=unbiased,
-                       out=out)
-
-        exe = fluid.Executor(self._place)
-        return exe.run(feed={"data": self._input},
-                       program=prog,
-                       fetch_list=[out])[0]
-
-    def dynamic(self, axis=None, keepdim=False, unbiased=True):
-        with fluid.dygraph.guard(self._place):
-            data = fluid.dygraph.to_variable(self._input)
-            out = paddle.std(input=data,
-                             axis=axis,
-                             keepdim=keepdim,
-                             unbiased=unbiased)
-            return out.numpy()
-
-    def numpy(self, axis=None, keepdim=False, unbiased=True):
-        ddof = 1 if unbiased else 0
-        axis = tuple(axis) if isinstance(axis, list) else axis
-        return np.std(self._input, axis=axis, keepdims=keepdim, ddof=ddof)
-
-    def test_equal(self):
-        places = []
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self._place = place
-            self.assertTrue(np.allclose(self.numpy(), self.static()))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2])))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(
-                        axis=[1, 3], keepdim=True),
-                    self.dynamic(
-                        axis=[1, 3], keepdim=True)))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(unbiased=False), self.dynamic(unbiased=False)))
+        self.dtype = 'float64'
+        self.shape = [1, 3, 4, 10]
+        self.axis = [1, 3]
+        self.keepdim = False
+        self.unbiased = True
+        self.set_attrs()
+        self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        pass
+
+    def static(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape, self.dtype)
+            out = paddle.std(x, self.axis, self.unbiased, self.keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        return res[0]
+
+    def dygraph(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        out = paddle.std(x, self.axis, self.unbiased, self.keepdim)
+        paddle.enable_static()
+        return out.numpy()
+
+    def test_api(self):
+        out_ref = ref_std(self.x, self.axis, self.unbiased, self.keepdim)
+        out_dygraph = self.dygraph()
+        out_static = self.static()
+        for out in [out_dygraph, out_static]:
+            self.assertTrue(np.allclose(out_ref, out))
+            self.assertTrue(np.equal(out_ref.shape, out.shape).all())
+
+
+class TestStdAPI_dtype(TestStdAPI):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestStdAPI_axis_int(TestStdAPI):
+    def set_attrs(self):
+        self.axis = 2
+
+
+class TestStdAPI_axis_list(TestStdAPI):
+    def set_attrs(self):
+        self.axis = [1, 2]
+
+
+class TestStdAPI_axis_tuple(TestStdAPI):
+    def set_attrs(self):
+        self.axis = (1, 3)
+
+
+class TestStdAPI_keepdim(TestStdAPI):
+    def set_attrs(self):
+        self.keepdim = False
+
+
+class TestStdAPI_unbiased(TestStdAPI):
+    def set_attrs(self):
+        self.unbiased = False
+
+
+class TestStdAPI_alias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([10, 12], 'float32'))
+        out1 = paddle.std(x).numpy()
+        out2 = paddle.tensor.std(x).numpy()
+        out3 = paddle.tensor.stat.std(x).numpy()
+        self.assertTrue(np.allclose(out1, out2))
+        self.assertTrue(np.allclose(out1, out3))
+        paddle.enable_static()
+
+
+class TestStdError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [2, 3, 4], 'int32')
+            self.assertRaises(TypeError, paddle.std, x)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index 8fd118c0193035fce294aa6ac23951d57ba43f78..b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -22,9 +22,11 @@ import unittest
 import numpy as np
 import os
 import six
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
+from paddle.fluid import Program, program_guard
 
 from op_test import OpTest, _set_use_system_allocator
 
@@ -202,5 +204,22 @@ class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
         self.atol = 1e-2
 
 
+class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
+    def test_errors(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        with program_guard(Program(), Program()):
+            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
+            self.assertRaises(TypeError, my_sync_batch_norm, x1)
+
+            # the input dtype of SyncBatchNorm must be float16 or float32 or float64
+            # float16 only can be set on GPU place
+            x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32")
+            self.assertRaises(TypeError, my_sync_batch_norm, x2)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aaf31993448ab0ff0c69f648cfa84c62d3e198b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -0,0 +1,251 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+
+
+# Situation 1: repeat_times is a list (without tensor)
+class TestTileOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")}
+        self.attrs = {'repeat_times': self.repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+# with dimension expanding
+class TestTileOpRank2Expanding(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.repeat_times = [2, 2]
+
+
+class TestTileOpRank2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+class TestTileOpRank3_Corner(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (1, 1, 1)
+
+
+class TestTileOpRank3_Corner2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (2, 2)
+
+
+class TestTileOpRank3(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 15)
+        self.repeat_times = (2, 1, 4)
+
+
+class TestTileOpRank4(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.repeat_times = (3, 2, 1, 2)
+
+
+# Situation 2: repeat_times is a list (with tensor)
+class TestTileOpRank1_tensor_attr(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+        repeat_times_tensor = []
+        for index, ele in enumerate(self.repeat_times):
+            repeat_times_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'repeat_times_tensor': repeat_times_tensor,
+        }
+        self.attrs = {"repeat_times": self.infer_repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+        self.infer_repeat_times = [-1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [1, 1]
+        self.infer_repeat_times = [1, -1]
+
+
+class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+        self.infer_repeat_times = [-1, 3]
+
+
+# Situation 3: repeat_times is a tensor
+class TestTileOpRank1_tensor(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'RepeatTimes': np.array(self.repeat_times).astype("int32"),
+        }
+        self.attrs = {}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+# Situation 4: input x is Integer
+class TestTileOpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(4, 4, 5)).astype("int32")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 5: input x is Bool
+class TestTileOpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 56: input x is Integer
+class TestTileOpInt64_t(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int64")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTileError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            repeat_times = [2, 2]
+            self.assertRaises(TypeError, paddle.tile, x1, repeat_times)
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
+            self.assertRaises(TypeError, paddle.tile, x2, repeat_times)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tile, x3, repeat_times)
+
+
+class TestTileAPIStatic(unittest.TestCase):
+    def test_api(self):
+        with program_guard(Program(), Program()):
+            repeat_times = [2, 2]
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype="int32")
+            out = paddle.tile(x1, repeat_times)
+            positive_2 = fluid.layers.fill_constant([1], dtype="int32", value=2)
+            out2 = paddle.tile(x1, repeat_times=[positive_2, 2])
+
+
+# Test python API
+class TestTileAPI(unittest.TestCase):
+    def test_api(self):
+        with fluid.dygraph.guard():
+            np_x = np.random.random([12, 14]).astype("float32")
+            x = paddle.to_variable(np_x)
+
+            positive_2 = np.array([2]).astype("int32")
+            positive_2 = paddle.to_variable(positive_2)
+
+            repeat_times = np.array([2, 3]).astype("int32")
+            repeat_times = paddle.to_variable(repeat_times)
+
+            out_1 = paddle.tile(x, repeat_times=[2, 3])
+            out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
+            out_3 = paddle.tile(x, repeat_times=repeat_times)
+
+            assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..54e7765c0fb76844a6123fceea6c1ef79dc0c2bf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
@@ -0,0 +1,244 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid.core as core
+
+
+def numpy_topk(x, k=1, axis=-1, largest=True):
+    if axis < 0:
+        axis = len(x.shape) + axis
+    if largest:
+        indices = np.argsort(-x, axis=axis)
+    else:
+        indices = np.argsort(x, axis=axis)
+    if largest:
+        value = -np.sort(-x, axis=axis)
+    else:
+        value = np.sort(x, axis=axis)
+    indices = indices.take(indices=range(0, k), axis=axis)
+    value = value.take(indices=range(0, k), axis=axis)
+    return value, indices
+
+
+class TestTopkOp(OpTest):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 20)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def test_check_output(self):
+        paddle.enable_static()
+        self.check_output()
+
+    def test_check_grad(self):
+        paddle.enable_static()
+        self.check_grad(set(['X']), 'Out')
+
+
+class TestTopOp1(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 0
+        self.largest = True
+
+
+class TestTopOp2(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopOp3(TestTopkOp):
+    def init_args(self):
+        self.k = 4
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopOp4(TestTopkOp):
+    def init_args(self):
+        self.k = 4
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopkOp5(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 10, 5)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+
+class TestTopkOp6(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 10, 5)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+
+class TestTopKAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.input_data = np.random.rand(6, 7, 8)
+        self.large_input_data = np.random.rand(2, 1030)
+
+    def run_dygraph(self, place):
+        paddle.disable_static(place)
+        input_tensor = paddle.to_tensor(self.input_data)
+        large_input_tensor = paddle.to_tensor(self.large_input_data)
+        # test case for basic test case 1
+        paddle_result = paddle.topk(input_tensor, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 2 with axis
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 3 with tensor K
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 4 with tensor largest
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 5 with axis -1
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 6 for the partial sort 
+        paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
+        numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 7 for the unsorted 
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+        sort_paddle = numpy_topk(
+            np.array(paddle_result[0].numpy()), axis=1, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def run_static(self, place):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            input_tensor = paddle.static.data(
+                name="x", shape=[6, 7, 8], dtype="float64")
+            large_input_tensor = paddle.static.data(
+                name="large_x", shape=[2, 1030], dtype="float64")
+            k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
+            result1 = paddle.topk(input_tensor, k=2)
+            result2 = paddle.topk(input_tensor, k=2, axis=-1)
+            result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
+            result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+            result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+            result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
+            result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+            exe = paddle.static.Executor(place)
+            input_data = np.random.rand(10, 20).astype("float64")
+            large_input_data = np.random.rand(2, 100).astype("float64")
+            paddle_result = exe.run(
+                feed={
+                    "x": self.input_data,
+                    "large_x": self.large_input_data,
+                    "k": np.array([2]).astype("int32")
+                },
+                fetch_list=[
+                    result1[0], result1[1], result2[0], result2[1], result3[0],
+                    result3[1], result4[0], result4[1], result5[0], result5[1],
+                    result6[0], result6[1], result7[0], result7[1]
+                ])
+            numpy_result = numpy_topk(self.input_data, k=2)
+            self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
+            numpy_result = numpy_topk(self.input_data, k=2, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[2], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[3], numpy_result[1]))
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=-1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
+            numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[10], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[11], numpy_result[1]))
+            sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2)
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def test_cases(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.run_dygraph(place)
+            self.run_static(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8d1e77134036bf7b28d4afb8bacaa44092b1053
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -0,0 +1,477 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer
+
+import unittest
+
+
+def generate_basic_params(mode="attn", self_attention=True):
+    batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)]
+    d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)]
+    attn_dropout = 0.0
+    embed_dim = d_head * num_heads
+    if mode == "attn":
+        if self_attention:
+            kdim, vdim = embed_dim, embed_dim
+            key_length, value_length = query_length, query_length
+        else:
+            kdim, vdim = [np.random.randint(5, 20) for _ in range(2)]
+            key_length = np.random.randint(2, 10)
+            value_length = key_length
+        return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout
+
+    else:
+        dropout, act_dropout = 0.0, 0.0
+        dim_feedforward = np.random.randint(128, 1024)
+        sequence_length = np.random.randint(2, 10)
+        if mode == "encoder_layer":
+            return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length
+        elif mode == "decoder_layer":
+            target_length = np.random.randint(2, 10)
+            return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length
+
+
+def generate_query_key_value_cache(self_attention,
+                                   batch_size,
+                                   num_heads,
+                                   query_length,
+                                   embed_dim,
+                                   key_length=None,
+                                   value_length=None,
+                                   kdim=None,
+                                   vdim=None,
+                                   cache=None):
+    query = np.random.rand(batch_size, query_length,
+                           embed_dim).astype("float32")
+    attn_mask = np.zeros((batch_size, num_heads, query_length, key_length))
+    attn_mask[0][0][0][0] = -1e9
+
+    head_dim = embed_dim // num_heads
+    if self_attention:
+        key, value = query, query
+    else:
+        key = np.random.rand(batch_size, key_length, kdim).astype("float32")
+        value = np.random.rand(batch_size, value_length, vdim).astype("float32")
+    cache_dict = {}
+    if cache:
+        if not self_attention:
+            cache_dict["static_k"] = np.random.rand(
+                batch_size, num_heads, key_length, head_dim).astype("float32")
+            cache_dict["static_v"] = np.random.rand(
+                batch_size, num_heads, value_length, head_dim).astype("float32")
+        else:
+            cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length,
+                                             head_dim).astype("float32")
+            cache_dict["v"] = np.random.rand(
+                batch_size, num_heads, value_length, head_dim).astype("float32")
+    else:
+        cache_dict = None
+    return query, key, value, attn_mask, cache_dict
+
+
+def fc(x, weight):
+    return np.matmul(x, weight)
+
+
+def softmax(x):
+    np.seterr(invalid='ignore')
+    output = np.zeros(x.shape, dtype=np.float64)
+    for i in range(x.shape[0]):
+        for j in range(x.shape[1]):
+            for k in range(x.shape[2]):
+                x_curr = x[i, j, k, :]
+                e_x = np.exp(x_curr - np.amax(x_curr))
+                output[i, j, k, :] = e_x / np.sum(e_x)
+    return output
+
+
+def batch_matmul(x, y):
+    assert x.shape[0] == y.shape[0]
+    assert x.shape[1] == y.shape[1]
+    retval = np.zeros(
+        (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64)
+    for i in range(x.shape[0]):
+        for j in range(x.shape[1]):
+            retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :])
+    return retval
+
+
+def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn):
+    k = k.transpose([0, 1, 3, 2])
+    qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64))
+    if attn_mask is not None:
+        qkt += attn_mask
+    weight = softmax(qkt)
+    attn_heads = batch_matmul(weight, v)
+    attn_heads = attn_heads.transpose((0, 2, 1, 3))
+    attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1],
+                                     attn_heads.shape[2] * attn_heads.shape[3]))
+    return attn_heads
+
+
+def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn):
+    with fluid.dygraph.guard():
+        head_dim = embed_dim // num_heads
+        k_weight = multi_head_attn.k_proj.weight.numpy()
+        v_weight = multi_head_attn.v_proj.weight.numpy()
+        k = fc(key, k_weight)
+        v = fc(value, v_weight)
+        k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim))
+        k = k.transpose((0, 2, 1, 3))
+        v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim))
+        v = v.transpose((0, 2, 1, 3))
+        return k, v
+
+
+def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention,
+                multi_head_attn, cache_dict):
+    q_weight = multi_head_attn.q_proj.weight.numpy()
+    q = fc(query, q_weight)
+    q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads))
+    q = q.transpose((0, 2, 1, 3))
+
+    if not self_attention and cache_dict:
+        k, v = cache_dict["static_k"], cache_dict["static_v"]
+    else:
+        k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn)
+        if cache_dict is not None:
+            k = np.concatenate((cache_dict["k"], k), axis=2)
+            v = np.concatenate((cache_dict["v"], v), axis=2)
+    return (q, k, v, cache_dict)
+
+
+def add(x, y=None):
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        x = x.numpy() if not isinstance(x, np.ndarray) else x
+        if y is not None:
+            x += y
+            return x
+        return x
+
+
+def relu(x):
+    compare = x > 0
+    return x * compare
+
+
+def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None):
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        # scale:
+        weight = norm.weight.numpy()
+        # shift:
+        bias = norm.bias.numpy()
+
+        batch_size, src_len, d_model = x.shape
+        x = x.reshape((batch_size * src_len, d_model))
+        mu = np.mean(x, axis=1, keepdims=True)
+        sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model
+        x1_up = (x - mu)
+        x1_down_1 = sigma_squar + epsilon
+        x1_down = np.sqrt(x1_down_1)
+        x1_down = x1_down.reshape((x1_down.shape[0], 1))
+        x1 = x1_up / x1_down
+        x_scaled = weight * x1
+        x_scaled_bias = x_scaled + bias
+        x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model))
+    return x_scaled_bias
+
+
+def ffn(src, encoder_layer, ffn_fc1_act="relu"):
+    assert ffn_fc1_act == "relu", "only relu is supported"
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        src = src.numpy() if not isinstance(src, np.ndarray) else src
+        w1 = encoder_layer.linear1.weight.numpy()
+        w2 = encoder_layer.linear2.weight.numpy()
+        # fc1
+        x1 = fc(src, w1)
+        x1 = relu(x1)
+        # fc2
+        x2 = fc(x1, w2)
+        return x2
+
+
+class TestTransformer(unittest.TestCase):
+    def test_multi_head_attention(self):
+        def multihead_attention_test_helper(self_attention, cache):
+            paddle.framework.manual_seed(2020)
+            # self_attention|cross_attention, cache|No cache
+            with fluid.dygraph.guard(fluid.CPUPlace()):
+
+                # generate params for multi_head_attention
+                batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params(
+                    "attn", self_attention)
+                query, key, value, attn_mask, cache_dict = generate_query_key_value_cache(
+                    self_attention, batch_size, num_heads, query_length,
+                    embed_dim, key_length, value_length, kdim, vdim, cache)
+                if cache and self_attention:
+                    attn_mask = np.concatenate((attn_mask, attn_mask), axis=3)
+                need_weight, param_attr, bias_attr = False, None, None
+                # call paddle's function
+                multi_head_attn = MultiHeadAttention(
+                    embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight,
+                    param_attr, bias_attr)
+                # construct cache object
+                cache_obj = None
+                if cache_dict:
+                    if 'k' and 'v' in cache_dict:
+                        cache_obj = multi_head_attn.Cache(
+                            paddle.to_variable(cache_dict['k']),
+                            paddle.to_variable(cache_dict['v']))
+                    elif 'static_k' and 'static_v' in cache_dict:
+                        cache_obj = multi_head_attn.StaticCache(
+                            paddle.to_variable(cache_dict['static_k']),
+                            paddle.to_variable(cache_dict['static_v']))
+                if attn_mask is not None:
+                    attn_output = multi_head_attn(
+                        paddle.to_variable(query),
+                        paddle.to_variable(key),
+                        paddle.to_variable(value),
+                        paddle.to_variable(attn_mask), cache_obj)
+                else:
+                    attn_output = multi_head_attn(
+                        paddle.to_variable(query),
+                        paddle.to_variable(key),
+                        paddle.to_variable(value), attn_mask, cache_obj)
+                attn_output = attn_output[0] if cache_dict else attn_output
+
+                # implementation by numpy
+                # compute q, k, v
+                q, k, v, _ = prepare_qkv(query, key, value, num_heads,
+                                         embed_dim, self_attention,
+                                         multi_head_attn, cache_dict)
+                # scale dot product attention
+                attn_heads = scaled_dot_product_attention(
+                    q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn)
+                out_proj_weight = multi_head_attn.out_proj.weight.numpy()
+                reference = fc(attn_heads, out_proj_weight)
+
+                np.testing.assert_allclose(
+                    attn_output.numpy(), reference, atol=1e-6)
+
+        multihead_attention_test_helper(True, True)
+        multihead_attention_test_helper(True, False)
+        multihead_attention_test_helper(False, True)
+        multihead_attention_test_helper(False, False)
+
+    def test_transformer_encoder_layer(self):
+
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            paddle.framework.manual_seed(2020)
+
+            ffn_fc1_act = "relu"
+            # 1.generate basic params
+            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
+                mode="encoder_layer")
+            # 2.generate input for encoder
+            src = np.random.rand(batch_size, sequence_length,
+                                 d_model).astype("float32")
+            residual = src
+            src_mask = np.zeros((batch_size, n_head, sequence_length,
+                                 sequence_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+
+            # paddle
+            encoder_layer = TransformerEncoderLayer(
+                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
+                attn_dropout, act_dropout)
+
+            encoder_output = encoder_layer(
+                paddle.to_variable(src),
+                paddle.to_variable(src_mask))  # paddle.to_variable(src_mask))
+            # 4.numpy:
+            # paddle self attention
+            self_attn = MultiHeadAttention(
+                d_model, n_head, dropout=attn_dropout)
+            attn_output = self_attn(
+                paddle.to_variable(src),
+                paddle.to_variable(src),
+                paddle.to_variable(src), paddle.to_variable(src_mask)).numpy()
+
+            src = attn_output + residual
+            src_norm = layer_norm(src, d_model, encoder_layer.norm1)
+            residual = src_norm
+
+            ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
+            src = residual + ffn_output
+            src = layer_norm(src, d_model, encoder_layer.norm2)
+
+            np.testing.assert_allclose(
+                encoder_output.numpy(), src, rtol=1e-5, atol=1e-6)
+
+    def test_transformer_decoder_layer(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            paddle.framework.manual_seed(2020)
+            activation = "relu"
+            normalize_before = False
+            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params(
+                mode="decoder_layer")
+            tgt = np.random.rand(batch_size, target_length,
+                                 d_model).astype("float32")
+            memory = np.random.rand(batch_size, source_length,
+                                    d_model).astype("float32")
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            for cache in [True, False]:
+                self_attn = MultiHeadAttention(
+                    d_model, n_head, dropout=attn_dropout)
+                cross_attn = MultiHeadAttention(
+                    d_model, n_head, dropout=attn_dropout)
+
+                # paddle decoderlayer:
+                decoder_layer = TransformerDecoderLayer(
+                    d_model, n_head, dim_feedforward, dropout, activation,
+                    attn_dropout, act_dropout, normalize_before)
+                cache_objs = None
+                if cache:
+                    cache_objs = decoder_layer.gen_cache(
+                        paddle.to_variable(memory))
+
+                decoder_output = decoder_layer(
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(tgt_mask),
+                    paddle.to_variable(memory_mask), cache_objs)
+
+                decoder_output = decoder_output[0].numpy(
+                ) if cache else decoder_output.numpy()
+
+                # numpy:
+                residual = tgt
+                # self-attn
+                self_attn_cache = cache_objs[
+                    0] if cache_objs is not None else None
+                tgt = self_attn(
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt_mask), self_attn_cache)
+
+                tgt = tgt[0].numpy() if cache else tgt.numpy()
+
+                tgt = residual + tgt
+                # postprocess
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1)
+                residual = tgt_norm
+                # cross-attn
+                cross_attn_cache = cache_objs[
+                    1] if cache_objs is not None else None
+                tgt = cross_attn(
+                    paddle.to_variable(tgt_norm),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(memory_mask), cross_attn_cache)
+                tgt = tgt[0].numpy() if cache else tgt.numpy()
+
+                # postprocess
+                tgt = tgt + residual
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2)
+                residual = tgt_norm
+                # FFN
+                ffn_output = ffn(tgt_norm, decoder_layer, activation)
+                # post process
+                tgt = residual + ffn_output
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)
+
+                np.testing.assert_allclose(
+                    decoder_output, tgt_norm, rtol=1e-5, atol=1e-6)
+
+    def test_encoder(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
+            mode="encoder_layer")
+
+        src = np.random.rand(batch_size, sequence_length,
+                             d_model).astype("float32")
+
+        src_mask = np.zeros((batch_size, n_head, sequence_length,
+                             sequence_length)).astype("float32")
+        src_mask[0][0][0][0] = -np.inf
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            encoder_layer = TransformerEncoderLayer(d_model, n_head,
+                                                    dim_feedforward, dropout)
+            num_layers = 6
+            encoder = TransformerEncoder(encoder_layer, num_layers)
+            # src, src_mask
+            enc_output = encoder(
+                paddle.to_variable(src), paddle.to_variable(src_mask))
+
+    def test_decoder(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        tgt = np.random.rand(batch_size, target_length,
+                             d_model).astype("float32")
+        memory = np.random.rand(batch_size, source_length,
+                                d_model).astype("float32")
+        tgt_mask = np.zeros((batch_size, n_head, target_length,
+                             target_length)).astype("float32")
+        tgt_mask[0][0][0][0] = -1e9
+        memory_mask = np.zeros((batch_size, n_head, target_length,
+                                source_length)).astype("float32")
+        memory_mask[0][0][0][0] = -1e9
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            decoder_layer = TransformerDecoderLayer(d_model, n_head,
+                                                    dim_feedforward, dropout)
+            num_layers = 6
+            decoder = TransformerDecoder(decoder_layer, num_layers)
+
+            output = decoder(
+                paddle.to_variable(tgt),
+                paddle.to_variable(memory),
+                paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask))
+
+    def test_transformer(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..49924b44441aa9ae323f0d7921d71bf58b8c2cf2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
@@ -0,0 +1,681 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+
+
+def trilinear_interp_np(input,
+                        out_d,
+                        out_h,
+                        out_w,
+                        out_size=None,
+                        actual_shape=None,
+                        align_corners=True,
+                        align_mode=0,
+                        data_layout='NCDHW'):
+    """trilinear interpolation implement in shape [N, C, D, H, W]"""
+    if data_layout == "NDHWC":
+        input = np.transpose(input, (0, 4, 1, 2, 3))  # NDHWC => NCDHW
+    if out_size is not None:
+        out_d = out_size[0]
+        out_h = out_size[1]
+        out_w = out_size[2]
+    if actual_shape is not None:
+        out_d = actual_shape[0]
+        out_h = actual_shape[1]
+        out_w = actual_shape[2]
+    batch_size, channel, in_d, in_h, in_w = input.shape
+
+    ratio_d = ratio_h = ratio_w = 0.0
+    if out_d > 1:
+        if (align_corners):
+            ratio_d = (in_d - 1.0) / (out_d - 1.0)
+        else:
+            ratio_d = 1.0 * in_d / out_d
+    if out_h > 1:
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_d, out_h, out_w))
+
+    for i in range(out_d):
+        if (align_mode == 0 and not align_corners):
+            d = int(ratio_d * (i + 0.5) - 0.5)
+        else:
+            d = int(ratio_d * i)
+
+        d = max(0, d)
+        did = 1 if d < in_d - 1 else 0
+        if (align_mode == 0 and not align_corners):
+            idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0)
+            d1lambda = idx_src_d - d
+        else:
+            d1lambda = ratio_d * i - d
+        d2lambda = 1.0 - d1lambda
+
+        for j in range(out_h):
+            if (align_mode == 0 and not align_corners):
+                h = int(ratio_h * (j + 0.5) - 0.5)
+            else:
+                h = int(ratio_h * j)
+
+            h = max(0, h)
+            hid = 1 if h < in_h - 1 else 0
+            if (align_mode == 0 and not align_corners):
+                idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0)
+                h1lambda = idx_src_h - h
+            else:
+                h1lambda = ratio_h * j - h
+            h2lambda = 1.0 - h1lambda
+
+            for k in range(out_w):
+                if (align_mode == 0 and not align_corners):
+                    w = int(ratio_w * (k + 0.5) - 0.5)
+                else:
+                    w = int(ratio_w * k)
+                w = max(0, w)
+                wid = 1 if w < in_w - 1 else 0
+                if (align_mode == 0 and not align_corners):
+                    idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0)
+                    w1lambda = idx_src_w - w
+                else:
+                    w1lambda = ratio_w * k - w
+                w2lambda = 1.0 - w1lambda
+
+                out[:, :, i, j, k] = \
+                    d2lambda * \
+                    (h2lambda * (w2lambda * input[:, :, d, h, w] + \
+                              w1lambda * input[:, :, d, h, w+wid]) + \
+                    h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \
+                              w1lambda * input[:, :, d, h+hid, w+wid])) + \
+                    d1lambda * \
+                    (h2lambda * (w2lambda * input[:, :, d+did, h, w] + \
+                              w1lambda * input[:, :, d+did, h, w+wid]) + \
+                    h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
+                              w1lambda * input[:, :, d+did, h+hid, w+wid]))
+    if data_layout == "NDHWC":
+        out = np.transpose(out, (0, 2, 3, 4, 1))  # NCDHW => NDHWC
+
+    return out.astype(input.dtype)
+
+
+class TestTrilinearInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCDHW'
+        self.init_test_case()
+        self.op_type = "trilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float32")
+
+        if self.data_layout == "NCDHW":
+            in_d = self.input_shape[2]
+            in_h = self.input_shape[3]
+            in_w = self.input_shape[4]
+        else:
+            in_d = self.input_shape[1]
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(in_d * scale_d)
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = trilinear_interp_np(
+            input_np, out_d, out_h, out_w, self.out_size, self.actual_shape,
+            self.align_corners, self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        # c++ end treat NCDHW the same way as NCHW
+        if self.data_layout == 'NCDHW':
+            data_layout = 'NCHW'
+        else:
+            data_layout = 'NHWC'
+        self.attrs = {
+            'out_d': self.out_d,
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': data_layout
+        }
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 4, 4, 4]
+        self.out_d = 2
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase1(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 1, 7, 8, 9]
+        self.out_d = 1
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase2(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 9, 6, 8]
+        self.out_d = 12
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase3(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 16, 8, 4]
+        self.out_d = 32
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase4(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [4, 1, 7, 8, 9]
+        self.out_d = 1
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2, 2]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase5(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 3, 9, 6, 8]
+        self.out_d = 12
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11, 11]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase6(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 8
+        self.out_h = 32
+        self.out_w = 16
+        self.scale = 0.
+        self.out_size = np.array([17, 9, 5]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpSame(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 16
+        self.out_h = 8
+        self.out_w = 4
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpSameHW(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 8
+        self.out_h = 8
+        self.out_w = 4
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 16, 8, 4]
+        self.out_d = 64
+        self.out_h = 32
+        self.out_w = 16
+        self.scale = 0.
+        self.out_size = np.array([33, 19, 7]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 4, 4, 4, 3]
+        self.out_d = 2
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+        self.data_layout = "NDHWC"
+
+
+class TestTrilinearInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "trilinear_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(self.input_shape[2] * scale_d)
+            out_h = int(self.input_shape[3] * scale_h)
+            out_w = int(self.input_shape[4] * scale_w)
+        else:
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
+                                        self.out_size, self.actual_shape,
+                                        self.align_corners, self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = {
+            'out_d': self.out_d,
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 3, 9, 6, 8]
+        self.out_d = 13
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 16, 8, 4]
+        self.out_d = 13
+        self.out_h = 7
+        self.out_w = 2
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [4, 1, 7, 8, 9]
+        self.out_d = 3
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15, 21]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestTrilinearInterpScale1(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 82
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpScale2(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 1.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpScale3(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpZero(TestTrilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 11]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestTrilinearInterpOp_attr_tensor(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "trilinear_interp_v2"
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float32")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:
+            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
+        elif self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(self.input_shape[2] * scale_d)
+            out_h = int(self.input_shape[3] * scale_h)
+            out_w = int(self.input_shape[4] * scale_w)
+        else:
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_d'] = self.out_d
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale > 0:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
+                                        self.out_size, self.actual_shape,
+                                        self.align_corners, self.align_mode)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 4, 4, 4]
+        self.out_d = 2
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [2, 3, 3]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+# out_size is a 1-D tensor
+class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 9, 6, 8]
+        self.out_d = 32
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 0.3
+        self.out_size = [12, 4, 4]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+# scale is a 1-D tensor
+class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 8, 8, 4]
+        self.out_d = 16
+        self.out_h = 12
+        self.out_w = 4
+        self.scale = 0.
+        self.out_size = [16, 4, 10]
+        self.align_corners = True
+        self.align_mode = 1
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 8, 8, 4]
+        self.out_d = 16
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.align_mode = 1
+        self.scale_by_1Dtensor = True
+
+
+class TestTrilinearInterpAPI(unittest.TestCase):
+    def test_case(self):
+        x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32")
+        y = fluid.data(name="y", shape=[2, 6, 9, 4, 3], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[3], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_trilinear(
+            y, out_shape=[12, 18, 8], data_format='NDHWC')
+        out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
+        out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
+        out4 = fluid.layers.resize_trilinear(
+            x, out_shape=[4, 4, 8], actual_shape=actual_size)
+        out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor)
+        out6 = interpolate(
+            x, scale_factor=scale_tensor, mode='trilinear', data_format="NCDHW")
+        out7 = interpolate(
+            x, size=[4, 4, 8], mode='trilinear', data_format="NCDHW")
+        out8 = interpolate(
+            x, size=shape_tensor, mode='trilinear', data_format="NCDHW")
+
+        x_data = np.random.random((2, 3, 6, 9, 4)).astype("float32")
+        dim_data = np.array([18]).astype("int32")
+        shape_data = np.array([12, 18, 8]).astype("int32")
+        actual_size_data = np.array([12, 18, 8]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x": x_data,
+                              "y": np.transpose(x_data, (0, 2, 3, 4, 1)),
+                              "dim": dim_data,
+                              "shape_tensor": shape_data,
+                              "actual_size": actual_size_data,
+                              "scale_tensor": scale_data
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5],
+                          return_numpy=True)
+
+        expect_res = trilinear_interp_np(
+            x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
+        self.assertTrue(
+            np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
+        for i in range(len(results) - 1):
+            self.assertTrue(np.allclose(results[i + 1], expect_res))
+
+
+class TestTrilinearInterpOpException(unittest.TestCase):
+    def test_exception(self):
+        input = fluid.data(name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+
+        def attr_data_format():
+            # for 5-D input, data_format only can be NCDHW or NDHWC
+            out = fluid.layers.resize_trilinear(
+                input, out_shape=[4, 8, 4], data_format='NHWC')
+
+        self.assertRaises(ValueError, attr_data_format)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 9a64dd1deea93f473d73d485ec5a9d707aaa54f9..158462a1e6e1012b7473a2410f2c003d04ea2e40 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -14,9 +14,12 @@
 
 from __future__ import print_function
 
+import sys
+import subprocess
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
@@ -472,5 +475,61 @@ class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestUniformAlias(unittest.TestCase):
+    def test_alias(self):
+        paddle.uniform([2, 3], min=-5.0, max=5.0)
+        paddle.tensor.uniform([2, 3], min=-5.0, max=5.0)
+        paddle.tensor.random.uniform([2, 3], min=-5.0, max=5.0)
+
+        def test_uniform_random():
+            paddle.tensor.random.uniform_random([2, 3], min=-5.0, max=5.0)
+
+        self.assertRaises(AttributeError, test_uniform_random)
+
+
+class TestUniformOpError(unittest.TestCase):
+    def test_errors(self):
+        main_prog = Program()
+        start_prog = Program()
+        with program_guard(main_prog, start_prog):
+
+            def test_Variable():
+                x1 = fluid.create_lod_tensor(
+                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.tensor.random.uniform(x1)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_Variable2():
+                x1 = np.zeros((4, 784))
+                paddle.tensor.random.uniform(x1)
+
+            self.assertRaises(TypeError, test_Variable2)
+
+            def test_dtype():
+                x2 = fluid.layers.data(
+                    name='x2', shape=[4, 784], dtype='float32')
+                paddle.tensor.random.uniform(x2, 'int32')
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_out_dtype():
+                out = paddle.tensor.random.uniform(
+                    shape=[3, 4], dtype='float64')
+                self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
+
+            test_out_dtype()
+
+
+class TestUniformDygraphMode(unittest.TestCase):
+    def test_check_output(self):
+        with fluid.dygraph.guard():
+            x = paddle.tensor.random.uniform(
+                [10], dtype="float32", min=0.0, max=1.0)
+            x_np = x.numpy()
+            for i in range(10):
+                self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
index 65194524adfcd7f48efe0f2378c95dba7a8c4a5e..a2c60d870e5e13fd161945fe0abe9b3ab82cc82c 100644
--- a/python/paddle/fluid/tests/unittests/test_unique.py
+++ b/python/paddle/fluid/tests/unittests/test_unique.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
@@ -125,5 +126,185 @@ class TestRandomGPU(TestUniqueOp):
             self.check_output_with_place(place, atol=1e-5)
 
 
+class TestSortedUniqueOp(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')}
+        unique, indices, inverse, count = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=None)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": None,
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": count,
+        }
+
+
+class TestUniqueOpAxisNone(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')}
+        unique, indices, inverse, counts = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=None)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": None,
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": counts,
+        }
+
+
+class TestUniqueOpAxis1(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')}
+        unique, indices, inverse, counts = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=1)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": [1],
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": counts,
+        }
+
+
+class TestUniqueAPI(unittest.TestCase):
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        x_data = x_data = np.random.randint(0, 10, (120))
+        x = paddle.to_tensor(x_data)
+        out = paddle.unique(x)
+        expected_out = np.unique(x_data)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        paddle.enable_static()
+
+    def test_dygraph_api_attr(self):
+        paddle.disable_static()
+        x_data = np.random.random((3, 5, 5)).astype("float32")
+        x = paddle.to_tensor(x_data)
+        out, index, inverse, counts = paddle.unique(
+            x,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=0)
+        np_out, np_index, np_inverse, np_counts = np.unique(
+            x_data,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=0)
+        self.assertTrue((out.numpy() == np_out).all(), True)
+        self.assertTrue((index.numpy() == np_index).all(), True)
+        self.assertTrue((inverse.numpy() == np_inverse).all(), True)
+        self.assertTrue((counts.numpy() == np_counts).all(), True)
+        paddle.enable_static()
+
+    def test_dygraph_attr_dtype(self):
+        paddle.disable_static()
+        x_data = x_data = np.random.randint(0, 10, (120))
+        x = paddle.to_tensor(x_data)
+        out, indices, inverse, counts = paddle.unique(
+            x,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            dtype="int32")
+        expected_out, np_indices, np_inverse, np_counts = np.unique(
+            x_data, return_index=True, return_inverse=True, return_counts=True)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        self.assertTrue((indices.numpy() == np_indices).all(), True)
+        self.assertTrue((inverse.numpy() == np_inverse).all(), True)
+        self.assertTrue((counts.numpy() == np_counts).all(), True)
+        paddle.enable_static()
+
+    def test_static_graph(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data(name='x', shape=[3, 2], dtype='float64')
+            unique, inverse, counts = paddle.unique(
+                x, return_inverse=True, return_counts=True, axis=0)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64')
+            result = exe.run(feed={"x": x_np},
+                             fetch_list=[unique, inverse, counts])
+        np_unique, np_inverse, np_counts = np.unique(
+            x_np, return_inverse=True, return_counts=True, axis=0)
+        self.assertTrue(np.allclose(result[0], np_unique))
+        self.assertTrue(np.allclose(result[1], np_inverse))
+        self.assertTrue(np.allclose(result[2], np_counts))
+
+
+class TestUniqueError(unittest.TestCase):
+    def test_input_dtype(self):
+        def test_x_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float16')
+                result = paddle.unique(x)
+
+            self.assertRaises(TypeError, test_x_dtype)
+
+    def test_attr(self):
+        x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+
+        def test_return_index():
+            result = paddle.unique(x, return_index=0)
+
+        self.assertRaises(TypeError, test_return_index)
+
+        def test_return_inverse():
+            result = paddle.unique(x, return_inverse='s')
+
+        self.assertRaises(TypeError, test_return_inverse)
+
+        def test_return_counts():
+            result = paddle.unique(x, return_counts=3)
+
+        self.assertRaises(TypeError, test_return_counts)
+
+        def test_axis():
+            result = paddle.unique(x, axis='12')
+
+        def test_dtype():
+            result = paddle.unique(x, dtype='float64')
+
+        self.assertRaises(TypeError, test_axis)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 7e565ca31b219366b7ab83267b46f32e5812d983..80b94704c388824901312b5d577cb5cfd0d0c75b 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
@@ -28,6 +29,74 @@ class TestVarBase(unittest.TestCase):
         self.dtype = np.float32
         self.array = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
 
+    def test_to_tensor(self):
+        def _test_place(place):
+            with fluid.dygraph.guard():
+                x = paddle.to_tensor(
+                    1, dtype='float32', place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1.]))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.shape, [1])
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                x = paddle.to_tensor(
+                    (1, 2), dtype='float32', place=place, stop_gradient=False)
+                x = paddle.to_tensor(
+                    [1, 2], dtype='float32', place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1., 2.]))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.grad, None)
+                self.assertEqual(x.shape, [2])
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                x = paddle.to_tensor(
+                    self.array,
+                    dtype='float32',
+                    place=place,
+                    stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), self.array))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.shape, self.shape)
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                y = paddle.to_tensor(x)
+                y = paddle.to_tensor(y, dtype='float64', place=place)
+                self.assertTrue(np.array_equal(y.numpy(), self.array))
+                self.assertEqual(y.dtype, core.VarDesc.VarType.FP64)
+                self.assertEqual(y.shape, self.shape)
+                self.assertEqual(y.stop_gradient, True)
+                self.assertEqual(y.type, core.VarDesc.VarType.LOD_TENSOR)
+                z = x + y
+                self.assertTrue(np.array_equal(z.numpy(), 2 * self.array))
+
+                x = paddle.to_tensor(
+                    [1 + 2j, 1 - 2j], dtype='complex64', place=place)
+                y = paddle.to_tensor(x)
+                self.assertTrue(np.array_equal(x.numpy(), [1 + 2j, 1 - 2j]))
+                self.assertEqual(y.dtype, 'complex64')
+                self.assertEqual(y.shape, [2])
+                self.assertEqual(y.real.stop_gradient, True)
+                self.assertEqual(y.real.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                with self.assertRaises(TypeError):
+                    paddle.to_tensor('test')
+                with self.assertRaises(TypeError):
+                    paddle.to_tensor(1, dtype='test')
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]])
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]], place='test')
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]], place=1)
+
+        _test_place(core.CPUPlace())
+        if core.is_compiled_with_cuda():
+            _test_place(core.CUDAPinnedPlace())
+            _test_place(core.CUDAPlace(0))
+
     def test_to_variable(self):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array, name="abc")
@@ -76,7 +145,7 @@ class TestVarBase(unittest.TestCase):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array)
 
-            self.assertEqual(var.name, 'generated_var_0')
+            self.assertEqual(var.name, 'generated_tensor_0')
             var.name = 'test'
             self.assertEqual(var.name, 'test')
 
diff --git a/python/paddle/fluid/tests/unittests/test_variance_layer.py b/python/paddle/fluid/tests/unittests/test_variance_layer.py
index 569f064db8549b5f28bc751a36cbe4b379636379..b5bb3cc978a558bb52f5f56c58f107b653956a75 100644
--- a/python/paddle/fluid/tests/unittests/test_variance_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_variance_layer.py
@@ -15,65 +15,104 @@
 import unittest
 import numpy as np
 import paddle
-import paddle.fluid as fluid
 
 
-class TestVarianceLayer(unittest.TestCase):
+def ref_var(x, axis=None, unbiased=True, keepdim=False):
+    ddof = 1 if unbiased else 0
+    if isinstance(axis, int):
+        axis = (axis, )
+    if axis is not None:
+        axis = tuple(axis)
+    return np.var(x, axis=axis, ddof=ddof, keepdims=keepdim)
+
+
+class TestVarAPI(unittest.TestCase):
     def setUp(self):
-        self._dtype = "float64"
-        self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype)
-
-    def static(self, axis=None, keepdim=False, unbiased=True):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            data = fluid.data(
-                name="data", dtype=self._dtype, shape=[None, 3, 4, 5])
-            out = prog.current_block().create_var(
-                dtype=self._dtype, shape=[2, 3, 4, 5])
-            paddle.var(input=data,
-                       axis=axis,
-                       keepdim=keepdim,
-                       unbiased=unbiased,
-                       out=out)
-
-        exe = fluid.Executor(self._place)
-        return exe.run(feed={"data": self._input},
-                       program=prog,
-                       fetch_list=[out])[0]
-
-    def dynamic(self, axis=None, keepdim=False, unbiased=True):
-        with fluid.dygraph.guard(self._place):
-            data = fluid.dygraph.to_variable(self._input)
-            out = paddle.var(input=data,
-                             axis=axis,
-                             keepdim=keepdim,
-                             unbiased=unbiased)
-            return out.numpy()
-
-    def numpy(self, axis=None, keepdim=False, unbiased=True):
-        ddof = 1 if unbiased else 0
-        axis = tuple(axis) if isinstance(axis, list) else axis
-        return np.var(self._input, axis=axis, keepdims=keepdim, ddof=ddof)
-
-    def test_equal(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self._place = place
-            self.assertTrue(np.allclose(self.numpy(), self.static()))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2])))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(
-                        axis=[1, 3], keepdim=True),
-                    self.dynamic(
-                        axis=[1, 3], keepdim=True)))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(unbiased=False), self.dynamic(unbiased=False)))
+        self.dtype = 'float64'
+        self.shape = [1, 3, 4, 10]
+        self.axis = [1, 3]
+        self.keepdim = False
+        self.unbiased = True
+        self.set_attrs()
+        self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        pass
+
+    def static(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape, self.dtype)
+            out = paddle.var(x, self.axis, self.unbiased, self.keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        return res[0]
+
+    def dygraph(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        out = paddle.var(x, self.axis, self.unbiased, self.keepdim)
+        paddle.enable_static()
+        return out.numpy()
+
+    def test_api(self):
+        out_ref = ref_var(self.x, self.axis, self.unbiased, self.keepdim)
+        out_dygraph = self.dygraph()
+        out_static = self.static()
+        for out in [out_dygraph, out_static]:
+            self.assertTrue(np.allclose(out_ref, out))
+            self.assertTrue(np.equal(out_ref.shape, out.shape).all())
+
+
+class TestVarAPI_dtype(TestVarAPI):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestVarAPI_axis_int(TestVarAPI):
+    def set_attrs(self):
+        self.axis = 2
+
+
+class TestVarAPI_axis_list(TestVarAPI):
+    def set_attrs(self):
+        self.axis = [1, 2]
+
+
+class TestVarAPI_axis_tuple(TestVarAPI):
+    def set_attrs(self):
+        self.axis = (1, 3)
+
+
+class TestVarAPI_keepdim(TestVarAPI):
+    def set_attrs(self):
+        self.keepdim = False
+
+
+class TestVarAPI_unbiased(TestVarAPI):
+    def set_attrs(self):
+        self.unbiased = False
+
+
+class TestVarAPI_alias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([10, 12], 'float32'))
+        out1 = paddle.var(x).numpy()
+        out2 = paddle.tensor.var(x).numpy()
+        out3 = paddle.tensor.stat.var(x).numpy()
+        self.assertTrue(np.allclose(out1, out2))
+        self.assertTrue(np.allclose(out1, out3))
+        paddle.enable_static()
+
+
+class TestVarError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [2, 3, 4], 'int32')
+            self.assertRaises(TypeError, paddle.var, x)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 449ac959188949559056654418ace3e227c368da..6bc42f0712a1a8c9f9a0640e06042c42e7cc948f 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -21,25 +21,25 @@ from op_test import OpTest
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle
+import paddle.nn.functional as F
 
 CUDA_BLOCK_SIZE = 512
 
 
 class CTCForward(object):
-    def __init__(self, softmax, softmax_lod, labels, labels_lod, blank,
-                 norm_by_times):
+    def __init__(self, softmax, softmax_lod, labels, labels_lod, num_classes,
+                 batch_size, blank, norm_by_times):
         self.softmax = softmax
         self.softmax_lod = softmax_lod
-        assert labels.shape[1] == 1
         self.labels = labels
         self.labels_lod = labels_lod
         self.blank = blank
         self.norm_by_times = norm_by_times
 
         self.level = 0
-        self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level])
-        assert self.batch_size == len(labels_lod[self.level])
+        self.num_classes = num_classes
+        self.batch_size = batch_size
 
         self.loss = np.zeros([self.batch_size, 1], dtype="float32")
         self.gradient = np.zeros(self.softmax.shape, dtype="float32")
@@ -163,17 +163,25 @@ class CTCForward(object):
         softmax_offset = 0
         labels_offset = 0
         for i in range(self.batch_size):
-            softmax_start_i = softmax_offset
-            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
-            labels_start_i = labels_offset
-            labels_end_i = labels_offset + self.labels_lod[self.level][i]
-
-            softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
-            labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
-            self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
-                                                   labels_a_sequence)
-            softmax_offset += self.softmax_lod[self.level][i]
-            labels_offset += self.labels_lod[self.level][i]
+            if self.labels.shape[1] == 1:
+                softmax_start_i = softmax_offset
+                softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
+                labels_start_i = labels_offset
+                labels_end_i = labels_offset + self.labels_lod[self.level][i]
+
+                softmax_a_sequence = self.softmax[softmax_start_i:
+                                                  softmax_end_i, :]
+                labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
+                self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
+                                                       labels_a_sequence)
+                softmax_offset += self.softmax_lod[self.level][i]
+                labels_offset += self.labels_lod[self.level][i]
+            else:
+                softmax_a_sequence = self.softmax[:self.softmax_lod[i], i, :]
+                labels_a_sequence = self.labels[:self.labels_lod[i], :]
+                self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
+                                                       labels_a_sequence)
+
         return self.loss
 
 
@@ -201,7 +209,8 @@ class TestWarpCTCOp(OpTest):
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
         loss = ctc.forward()
 
         max_sequence_length = 0
@@ -223,7 +232,7 @@ class TestWarpCTCOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
@@ -237,7 +246,7 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
         self.num_classes = CUDA_BLOCK_SIZE + 2
         self.logits_lod = [[4, 1, 3, 3]]
         self.labels_lod = [[3, 1, 4, 4]]
-        self.blank = 0
+        self.blank = self.num_classes - 1
         self.norm_by_times = False
 
 
@@ -267,7 +276,8 @@ class TestWarpCTCOpWithPadding(OpTest):
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
         loss = ctc.forward()
 
         max_sequence_length = 0
@@ -317,7 +327,7 @@ class TestWarpCTCOpWithPadding(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
@@ -333,7 +343,7 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
         self.labels_lod = [[3, 1, 4, 4]]
         self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
         self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = 0
+        self.blank = self.num_classes - 1
         self.norm_by_times = False
 
 
@@ -389,5 +399,97 @@ class TestWarpCTCOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_label_len_Variable)
 
 
+class TestCTCLossAPICase(unittest.TestCase):
+    def test_functinal_api(self):
+        self.batch_size = 4
+        self.num_classes = CUDA_BLOCK_SIZE + 2
+        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+
+        logits = np.random.uniform(0.1, 1.0, [
+            max(self.logits_length), self.batch_size, self.num_classes
+        ]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, -1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0,
+            self.num_classes - 1, [self.batch_size, max(self.labels_length)],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_length, labels,
+                         self.labels_length, self.num_classes, self.batch_size,
+                         self.blank, self.norm_by_times)
+        loss_np = ctc.forward()
+
+        paddle.disable_static()
+        softmax = paddle.to_variable(logits)
+        labels = paddle.to_variable(labels)
+        logits_length = paddle.to_variable(self.logits_length)
+        labels_length = paddle.to_variable(self.labels_length)
+        loss_pd_mean = F.ctc_loss(
+            softmax,
+            labels,
+            logits_length,
+            labels_length,
+            blank=self.blank,
+            reduction='mean')
+        loss_pd_mean = loss_pd_mean.numpy()
+
+        loss_pd_sum = F.ctc_loss(
+            softmax,
+            labels,
+            logits_length,
+            labels_length,
+            blank=self.blank,
+            reduction='sum')
+        loss_pd_sum = loss_pd_sum.numpy()
+        paddle.enable_static()
+        loss_np = np.squeeze(loss_np, axis=-1)
+        loss_np_mean = (loss_np / labels_length.numpy()).mean()
+        loss_np_sum = loss_np.sum()
+
+        self.assertTrue(np.allclose(loss_pd_mean, loss_np_mean, atol=1))
+        self.assertTrue(np.allclose(loss_pd_sum, loss_np_sum, atol=1))
+
+    def test_class_api(self):
+        self.batch_size = 3
+        self.num_classes = 15
+        self.logits_length = np.array([3, 3, 3], dtype=np.int64)
+        self.labels_length = np.array([0, 1, 2], dtype=np.int64)
+        self.blank = 0
+        self.norm_by_times = False
+
+        logits = np.random.uniform(0.1, 1.0, [
+            max(self.logits_length), self.batch_size, self.num_classes
+        ]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, -1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            1,
+            self.num_classes, [self.batch_size, max(self.labels_length)],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_length, labels,
+                         self.labels_length, self.num_classes, self.batch_size,
+                         self.blank, self.norm_by_times)
+        loss_np = ctc.forward()
+
+        paddle.disable_static()
+        softmax = paddle.to_variable(logits)
+        labels = paddle.to_variable(labels)
+        logits_length = paddle.to_variable(self.logits_length)
+        labels_length = paddle.to_variable(self.labels_length)
+
+        loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
+            softmax, labels, logits_length, labels_length)
+        loss_pd = loss_pd.numpy()
+        paddle.enable_static()
+        loss_np = np.squeeze(loss_np, axis=-1)
+
+        self.assertTrue(np.allclose(loss_pd, loss_np, atol=1))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
index 448751f19dbe76fdbd856d0464e36390c69aba41..21e618a46201659fe0c4e5c67d1d9a8bafd70f1b 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
@@ -62,20 +62,19 @@ class TestZerosLikeImpeartive(unittest.TestCase):
         shape = [3, 4]
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with paddle.imperative.guard(place):
-            x = paddle.imperative.to_variable(np.ones(shape))
-            for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
-                out = zeros_like(x, dtype)
-                self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
-                                 True)
-
-            out = paddle.tensor.zeros_like(x)
+        paddle.disable_static(place)
+        x = paddle.to_variable(np.ones(shape))
+        for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
+            out = zeros_like(x, dtype)
             self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
                              True)
 
-            out = paddle.tensor.creation.zeros_like(x)
-            self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
-                             True)
+        out = paddle.tensor.zeros_like(x)
+        self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True)
+
+        out = paddle.tensor.creation.zeros_like(x)
+        self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py
index 0cf51a87cf6b844c053ab1335e20df108d16e177..23dec935507fd977f884e952451b5ea98c935893 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py
@@ -39,15 +39,15 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(Program()):
+        with paddle.static.program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -55,7 +55,7 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -64,7 +64,7 @@ class ApiZerosTest(unittest.TestCase):
             out_np = np.zeros(shape=(1), dtype='float32')
             out = paddle.zeros(shape=[1], dtype="float32")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result = exe.run(fetch_list=[out])
             self.assertEqual((result == out_np).all(), True)
 
@@ -72,7 +72,7 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = fluid.layers.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -81,13 +81,13 @@ class ApiZerosTest(unittest.TestCase):
 class ApiZerosError(unittest.TestCase):
     def test_errors(self):
         def test_error1():
-            with paddle.program_guard(fluid.Program()):
+            with paddle.static.program_guard(fluid.Program()):
                 ones = fluid.layers.zeros(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error1)
 
         def test_error2():
-            with paddle.program_guard(fluid.Program()):
+            with paddle.static.program_guard(fluid.Program()):
                 ones = fluid.layers.zeros(shape=[10], dtype="int8")
 
         self.assertRaises(TypeError, test_error2)
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index b8258f3153a801dfc78db5f43325c0dce5c4b611..0de0eeb464ad700abb2144e49a822582b8653589 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -26,4 +26,5 @@ no_check_set_white_list = [
     'cross_entropy2',
     'seed',
     'amp_check_finite_and_scale',
+    'cudnn_lstm',
 ]
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
index 4629089e39c9489725340df2172c53ed0661708f..581656f6cd421b12cb4c373bd6d46648704f0c1a 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
@@ -73,6 +73,7 @@ NO_FP64_CHECK_GRAD_OP_LIST = [
     'mish', \
     'transpose2', \
     'trilinear_interp', \
+    'trilinear_interp_v2', \
     'var_conv_2d', \
     'warpctc', \
     'bilateral_slice'
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
index ce6868b5c70ae1218df48f899f936f57f6734582..47d62999c92d12ab4305272f60c1453cda211b09 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
@@ -15,6 +15,7 @@
 NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'affine_channel', \
     'bilinear_interp', \
+    'bilinear_interp_v2',\
     'bilinear_tensor_product', \
     'conv2d', \
     'conv3d', \
@@ -41,7 +42,10 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'unpool', \
     'yolov3_loss', \
     'inverse', \
-    'bilateral_slice'
+    'bilateral_slice',\
+    'cudnn_lstm'
 ]
 
-NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
+NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\
+                                                'bilinear_interp_v2'
+                                                ]
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index 3078f432c3a70308c929a4fdface215fb79eebcb..95a0cb52046790e44150dd6f74733ae86a75a570 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -14,23 +14,50 @@
 
 # TODO: import framework api under this directory 
 __all__ = [
-    'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
-    'BuildStrategy', 'CompiledProgram', 'default_main_program',
-    'default_startup_program', 'create_global_var', 'create_parameter', 'Print',
-    'py_func', 'ExecutionStrategy', 'name_scope', 'ParallelExecutor',
-    'ParamAttr', 'Program', 'program_guard', 'Variable', 'WeightNormParamAttr',
-    'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace'
+    'create_global_var', 'create_parameter', 'ParamAttr', 'Variable',
+    'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'get_default_dtype',
+    'set_default_dtype'
+]
+
+__all__ += [
+    'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable',
+    'no_grad', 'ParallelEnv', 'DataParallel'
+]
+
+__all__ += [
+    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
+    'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
 ]
 
 from . import random
 from .random import manual_seed
-from ..fluid.executor import Executor, global_scope, scope_guard
-from ..fluid.backward import append_backward, gradients
-from ..fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
-from ..fluid.framework import default_main_program, default_startup_program, name_scope, Program, program_guard, Variable
-from ..fluid.layers.control_flow import Print
-from ..fluid.layers.nn import py_func
-from ..fluid.parallel_executor import ParallelExecutor
-from ..fluid.param_attr import ParamAttr, WeightNormParamAttr
-from ..fluid.layers.tensor import create_global_var, create_parameter
-from ..fluid.core import CPUPlace, CUDAPlace, CUDAPinnedPlace
+from .framework import get_default_dtype
+from .framework import set_default_dtype
+
+from ..fluid.framework import Variable  #DEFINE_ALIAS
+from ..fluid.framework import ComplexVariable  #DEFINE_ALIAS
+from ..fluid.param_attr import ParamAttr  #DEFINE_ALIAS
+from ..fluid.layers.tensor import create_global_var  #DEFINE_ALIAS
+from ..fluid.layers.tensor import create_parameter  #DEFINE_ALIAS
+from ..fluid.core import CPUPlace  #DEFINE_ALIAS
+from ..fluid.core import CUDAPlace  #DEFINE_ALIAS
+from ..fluid.core import CUDAPinnedPlace  #DEFINE_ALIAS
+from ..fluid.core import VarBase  #DEFINE_ALIAS
+
+from paddle.fluid import core  #DEFINE_ALIAS
+from ..fluid.dygraph.base import no_grad  #DEFINE_ALIAS
+from ..fluid.dygraph.base import to_variable  #DEFINE_ALIAS
+from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
+from ..fluid.dygraph.checkpoint import load_dygraph as load  #DEFINE_ALIAS
+from ..fluid.dygraph.checkpoint import save_dygraph as save  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import prepare_context  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import ParallelEnv  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import DataParallel  #DEFINE_ALIAS
+
+from ..fluid.dygraph.learning_rate_scheduler import NoamDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import CosineDecay  #DEFINE_ALIAS
diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py
index 65654b59c083086967c1ef78f14b740b0779e722..41ec18ce32d3036c3db86aaa98053f59ff61f717 100644
--- a/python/paddle/framework/framework.py
+++ b/python/paddle/framework/framework.py
@@ -13,5 +13,70 @@
 # limitations under the License.
 
 # TODO: define framework api 
-# __all__ = ['set_default_dtype',
-#            'get_default_dtype']
+from paddle.fluid.layer_helper_base import LayerHelperBase
+from paddle.fluid.data_feeder import convert_dtype
+import numpy as np
+
+__all__ = ['set_default_dtype', 'get_default_dtype']
+
+
+def set_default_dtype(d):
+    """
+    Set default dtype. The default dtype is initially float32
+
+    Args:
+        d(string|np.dtype): the dtype to make the default. It only
+                            supports float16, float32 and float64.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.set_default_dtype("float32")
+
+    """
+    if isinstance(d, type):
+        if d in [np.float16, np.float32, np.float64]:
+            d = d.__name__
+        else:
+            raise TypeError(
+                "set_default_dtype only supports [float16, float32, float64] "
+                ", but received %s" % d.__name__)
+    else:
+        if d in [
+                'float16', 'float32', 'float64', u'float16', u'float32',
+                u'float64'
+        ]:
+            # this code is a little bit dangerous, since error could happen
+            # when casting no-ascii code to str in python2.
+            # but since the set itself is limited, so currently, it is good.
+            # however, jointly supporting python2 and python3, (as well as python4 maybe)
+            # may still be a long-lasting problem.
+            d = str(d)
+        else:
+            raise TypeError(
+                "set_default_dtype only supports [float16, float32, float64] "
+                ", but received %s" % str(d))
+
+    LayerHelperBase.set_default_dtype(d)
+
+
+def get_default_dtype():
+    """
+    Get the current default dtype. The default dtype is initially float32.
+
+    Args:
+        None.
+    Returns:
+        The default dtype.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.get_default_dtype()
+    """
+    return LayerHelperBase.get_default_dtype()
diff --git a/python/paddle/imperative/__init__.py b/python/paddle/imperative/__init__.py
deleted file mode 100644
index 489888a2fef39b2cca5b918a412d231784471ddc..0000000000000000000000000000000000000000
--- a/python/paddle/imperative/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# define api used to run in imperative mode 
-__all__ = [
-    'BackwardStrategy', 'enabled', 'grad', 'guard', 'LayerList', 'load', 'save',
-    'prepare_context', 'to_variable', 'TracedLayer', 'no_grad', 'ParallelEnv',
-    'ProgramTranslator', 'declarative', 'DataParallel', 'TranslatedLayer', 'jit'
-]
-
-__all__ += [
-    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
-    'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
-]
-
-from paddle.fluid import core
-from ..fluid.dygraph.base import enabled, guard, no_grad, to_variable, grad
-from ..fluid.dygraph.checkpoint import load_dygraph as load
-from ..fluid.dygraph.checkpoint import save_dygraph as save
-from ..fluid.dygraph.parallel import prepare_context, ParallelEnv, DataParallel
-from ..fluid.dygraph.jit import TracedLayer, declarative
-from ..fluid.dygraph import ProgramTranslator
-from . import jit
-
-from ..fluid.dygraph.learning_rate_scheduler import NoamDecay, PiecewiseDecay, NaturalExpDecay, ExponentialDecay, \
-        InverseTimeDecay, PolynomialDecay, CosineDecay
-
-BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/incubate/complex/tensor/linalg.py b/python/paddle/incubate/complex/tensor/linalg.py
index 3badf36280e27c9d7962a2b7b3fff596fd0e8cb3..946a0fd5534d13166706523675c93ef1d01cfa54 100644
--- a/python/paddle/incubate/complex/tensor/linalg.py
+++ b/python/paddle/incubate/complex/tensor/linalg.py
@@ -56,20 +56,20 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
                 # [1.+5.j 5.+9.j]         
     """
     # x = a + bi, y = c + di
-    # mm(x, y) = mm(a, c) - mm(b, d) + (mm(a, d) + mm(b, c))i
+    # P1 = ac; P2 = (a + b)(c + d); P3 = bd; then mm(x, y) = (P1-P3) + (P2-P1-P3)j
     complex_variable_exists([x, y], "matmul")
     a, b = (x.real, x.imag) if is_complex(x) else (x, None)
     c, d = (y.real, y.imag) if is_complex(y) else (y, None)
-    ac = layers.matmul(a, c, transpose_x, transpose_y, alpha, name)
+    P1 = layers.matmul(a, c, transpose_x, transpose_y, alpha, name)
     if is_real(b) and is_real(d):
-        bd = layers.matmul(b, d, transpose_x, transpose_y, alpha, name)
-        real = ac - bd
-        imag = layers.matmul(a, d, transpose_x, transpose_y, alpha, name) + \
-               layers.matmul(b, c, transpose_x, transpose_y, alpha, name)
+        P2 = layers.matmul(a + b, c + d, transpose_x, transpose_y, alpha, name)
+        P3 = layers.matmul(b, d, transpose_x, transpose_y, alpha, name)
+        real = P1 - P3
+        imag = P2 - P1 - P3
     elif is_real(b):
-        real = ac
+        real = P1
         imag = layers.matmul(b, c, transpose_x, transpose_y, alpha, name)
     else:
-        real = ac
+        real = P1
         imag = layers.matmul(a, d, transpose_x, transpose_y, alpha, name)
     return ComplexVariable(real, imag)
diff --git a/python/paddle/incubate/complex/tensor/math.py b/python/paddle/incubate/complex/tensor/math.py
index 5c26d6da8d9bb002a117ee40e0ce209c3fa0db9f..465e4887a1f8a8dc1d53afac8869f0b55776f3d2 100644
--- a/python/paddle/incubate/complex/tensor/math.py
+++ b/python/paddle/incubate/complex/tensor/math.py
@@ -261,8 +261,8 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None):
             
             case1 = np.random.randn(3, 10, 10).astype('float64') + 1j * np.random.randn(3, 10, 10).astype('float64')
             
-            paddle.enable_imperative()
-            case1 = paddle.imperative.to_variable(case1)
+            paddle.disable_static()
+            case1 = paddle.to_tensor(case1)
             data1 = paddle.complex.trace(case1, offset=1, axis1=1, axis2=2) # data1.shape = [3]
     """
     complex_variable_exists([x], "trace")
@@ -330,8 +330,8 @@ def sum(input, dim=None, keep_dim=False, name=None):
 
     """
     complex_variable_exists([input], "sum")
-    real = math.sum(input.real, dim=dim, keep_dim=keep_dim, name=name)
-    imag = math.sum(input.imag, dim=dim, keep_dim=keep_dim, name=name)
+    real = math.sum(input.real, axis=dim, keepdim=keep_dim, name=name)
+    imag = math.sum(input.imag, axis=dim, keepdim=keep_dim, name=name)
     return ComplexVariable(real, imag)
 
 
diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/incubate/hapi/__init__.py
index a6b5faef57ca95188f0759f53753177e4f5946f3..c0361fa33246ff3315a107c520972ca6bebc8168 100644
--- a/python/paddle/incubate/hapi/__init__.py
+++ b/python/paddle/incubate/hapi/__init__.py
@@ -20,7 +20,6 @@ from . import download
 from . import model
 from .model import *
 
-from . import metrics
 from . import datasets
 from . import distributed
 from . import vision
@@ -39,7 +38,6 @@ __all__ = [
     'datasets',
     'distributed',
     'download',
-    'metrics',
     'vision',
     'text',
     'utils',
diff --git a/python/paddle/incubate/hapi/callbacks.py b/python/paddle/incubate/hapi/callbacks.py
index 741552511f9fdc93d9e370fc7d45f9d84a1d4392..0804708210a9749813e195a8b5579b339986acd6 100644
--- a/python/paddle/incubate/hapi/callbacks.py
+++ b/python/paddle/incubate/hapi/callbacks.py
@@ -295,8 +295,8 @@ class ProgBarLogger(Callback):
             import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
 
-            inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')]
-            labels = [hapi.Input('label', [None, 1], 'int64')]
+            inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')]
+            labels = [hapi.Input([None, 1], 'int64', 'label')]
 
             train_dataset = hapi.datasets.MNIST(mode='train')
 
@@ -305,8 +305,8 @@ class ProgBarLogger(Callback):
 
             optim = fluid.optimizer.Adam(0.001)
             model.prepare(optimizer=optim,
-                        loss_function=paddle.nn.CrossEntropyLoss(),
-                        metrics=hapi.metrics.Accuracy())
+                        loss=paddle.nn.CrossEntropyLoss(),
+                        metrics=paddle.metric.Accuracy())
 
             callback = hapi.callbacks.ProgBarLogger(log_freq=10)
             model.fit(train_dataset, batch_size=64, callbacks=callback)
@@ -431,8 +431,8 @@ class ModelCheckpoint(Callback):
             import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
 
-            inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')]
-            labels = [hapi.Input('label', [None, 1], 'int64')]
+            inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')]
+            labels = [hapi.Input([None, 1], 'int64', 'label')]
 
             train_dataset = hapi.datasets.MNIST(mode='train')
 
@@ -441,8 +441,8 @@ class ModelCheckpoint(Callback):
 
             optim = fluid.optimizer.Adam(0.001)
             model.prepare(optimizer=optim,
-                        loss_function=paddle.nn.CrossEntropyLoss(),
-                        metrics=hapi.metrics.Accuracy())
+                        loss=paddle.nn.CrossEntropyLoss(),
+                        metrics=paddle.metric.Accuracy())
 
             callback = hapi.callbacks.ModelCheckpoint(save_dir='./temp')
             model.fit(train_dataset, batch_size=64, callbacks=callback)
diff --git a/python/paddle/incubate/hapi/datasets/__init__.py b/python/paddle/incubate/hapi/datasets/__init__.py
index fc5df6401992def4bc37329794e534a832924da3..a88b0e6bbf1975d97bfeb68025b978ce877c6baf 100644
--- a/python/paddle/incubate/hapi/datasets/__init__.py
+++ b/python/paddle/incubate/hapi/datasets/__init__.py
@@ -15,11 +15,41 @@
 from . import folder
 from . import mnist
 from . import flowers
+from . import cifar
+from . import voc2012
+from . import conll05
+from . import imdb
+from . import imikolov
+from . import movielens
+from . import movie_reviews
+from . import uci_housing
+from . import wmt14
+from . import wmt16
 
 from .folder import *
 from .mnist import *
 from .flowers import *
+from .cifar import *
+from .voc2012 import *
+from .conll05 import *
+from .imdb import *
+from .imikolov import *
+from .movielens import *
+from .movie_reviews import *
+from .uci_housing import *
+from .wmt14 import *
+from .wmt16 import *
 
 __all__ = folder.__all__ \
-        + mnist.__all__ \
-        + flowers.__all__
+          + mnist.__all__ \
+          + flowers.__all__ \
+          + cifar.__all__ \
+          + voc2012.__all__ \
+          + conll05.__all__ \
+          + imdb.__all__ \
+          + imikolov.__all__ \
+          + movielens.__all__ \
+          + movie_reviews.__all__ \
+          + uci_housing.__all__ \
+          + wmt14.__all__ \
+          + wmt16.__all__
diff --git a/python/paddle/incubate/hapi/datasets/cifar.py b/python/paddle/incubate/hapi/datasets/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..adfa786e615368ba90dab154924678de79104b55
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/cifar.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import tarfile
+import numpy as np
+import six
+from six.moves import cPickle as pickle
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Cifar10', 'Cifar100']
+
+URL_PREFIX = 'https://dataset.bj.bcebos.com/cifar/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+MODE_FLAG_MAP = {
+    'train10': 'data_batch',
+    'test10': 'test_batch',
+    'train100': 'train',
+    'test100': 'test'
+}
+
+
+class Cifar10(Dataset):
+    """
+    Implementation of `Cifar-10 <https://www.cs.toronto.edu/~kriz/cifar.html>`_
+    dataset, which has 10 categories.
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' mode. Default 'train'.
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of cifar-10 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Cifar10
+	    from paddle.incubate.hapi.vision.transforms import Normalize
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+		    self.fc = paddle.nn.Linear(3072, 10, act='softmax')
+
+		def forward(self, image, label):
+		    image = paddle.reshape(image, (3, -1))
+		    return self.fc(image), label
+
+	    paddle.disable_static()
+
+	    normalize = Normalize(mean=[0.5, 0.5, 0.5],
+				std=[0.5, 0.5, 0.5])
+	    cifar10 = Cifar10(mode='train', transform=normalize)
+
+	    for i in range(10):
+		image, label = cifar10[i]
+		image = paddle.to_tensor(image)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'train', 'test'], \
+            "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self._init_url_md5_flag()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, self.data_url, self.data_md5, 'cifar', download)
+
+        self.transform = transform
+
+        # read dataset into memory
+        self._load_data()
+
+    def _init_url_md5_flag(self):
+        self.data_url = CIFAR10_URL
+        self.data_md5 = CIFAR10_MD5
+        self.flag = MODE_FLAG_MAP[self.mode + '10']
+
+    def _load_data(self):
+        self.data = []
+        with tarfile.open(self.data_file, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if self.flag in each_item.name)
+
+            for name in names:
+                if six.PY2:
+                    batch = pickle.load(f.extractfile(name))
+                else:
+                    batch = pickle.load(f.extractfile(name), encoding='bytes')
+
+                data = batch[six.b('data')]
+                labels = batch.get(
+                    six.b('labels'), batch.get(six.b('fine_labels'), None))
+                assert labels is not None
+                for sample, label in six.moves.zip(data, labels):
+                    self.data.append((sample, label))
+
+    def __getitem__(self, idx):
+        image, label = self.data[idx]
+        if self.transform is not None:
+            image = self.transform(image)
+        return image, label
+
+    def __len__(self):
+        return len(self.data)
+
+
+class Cifar100(Cifar10):
+    """
+    Implementation of `Cifar-100 <https://www.cs.toronto.edu/~kriz/cifar.html>`_
+    dataset, which has 100 categories.
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' mode. Default 'train'.
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of cifar-100 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Cifar100
+	    from paddle.incubate.hapi.vision.transforms import Normalize
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+		    self.fc = paddle.nn.Linear(3072, 100, act='softmax')
+
+		def forward(self, image, label):
+		    image = paddle.reshape(image, (3, -1))
+		    return self.fc(image), label
+
+	    paddle.disable_static()
+
+	    normalize = Normalize(mean=[0.5, 0.5, 0.5],
+				std=[0.5, 0.5, 0.5])
+	    cifar100 = Cifar100(mode='train', transform=normalize)
+
+	    for i in range(10):
+		image, label = cifar100[i]
+		image = paddle.to_tensor(image)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        super(Cifar100, self).__init__(data_file, mode, transform, download)
+
+    def _init_url_md5_flag(self):
+        self.data_url = CIFAR100_URL
+        self.data_md5 = CIFAR100_MD5
+        self.flag = MODE_FLAG_MAP[self.mode + '100']
diff --git a/python/paddle/incubate/hapi/datasets/conll05.py b/python/paddle/incubate/hapi/datasets/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..094e3559335363524c4ae893f70294a4afaa7037
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/conll05.py
@@ -0,0 +1,297 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import gzip
+import tarfile
+import numpy as np
+import six
+from six.moves import cPickle as pickle
+
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['Conll05st']
+
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+class Conll05st(Dataset):
+    """
+    Implementation of `Conll05st <https://www.cs.upc.edu/~srlconll/soft.html>`_
+    test dataset.
+
+    Note: only support download test dataset automatically for that
+          only test dataset of Conll05st is public.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        word_dict_file(str): path to word dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        verb_dict_file(str): path to verb dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        target_dict_file(str): path to target dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        emb_file(str): path to embedding dictionary file, only used for
+            :code:`get_embedding` can be set None if :attr:`download` is
+            True. Default None
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
+            :attr:`target_dict_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of conll05st dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Conll05st
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, pred_idx, mark, label):
+		    return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)
+
+	    paddle.disable_static()
+
+	    conll05st = Conll05st()
+
+	    for i in range(10):
+		pred_idx, mark, label= conll05st[i][-3:]
+		pred_idx = paddle.to_tensor(pred_idx)
+		mark = paddle.to_tensor(mark)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		pred_idx, mark, label= model(pred_idx, mark, label)
+		print(pred_idx.numpy(), mark.numpy(), label.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 word_dict_file=None,
+                 verb_dict_file=None,
+                 target_dict_file=None,
+                 emb_file=None,
+                 download=True):
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, DATA_URL, DATA_MD5, 'conll05st', download)
+
+        self.word_dict_file = word_dict_file
+        if self.word_dict_file is None:
+            assert download, "word_dict_file is not set and downloading automatically is disabled"
+            self.word_dict_file = _check_exists_and_download(
+                word_dict_file, WORDDICT_URL, WORDDICT_MD5, 'conll05st',
+                download)
+
+        self.verb_dict_file = verb_dict_file
+        if self.verb_dict_file is None:
+            assert download, "verb_dict_file is not set and downloading automatically is disabled"
+            self.verb_dict_file = _check_exists_and_download(
+                verb_dict_file, VERBDICT_URL, VERBDICT_MD5, 'conll05st',
+                download)
+
+        self.target_dict_file = target_dict_file
+        if self.target_dict_file is None:
+            assert download, "target_dict_file is not set and downloading automatically is disabled"
+            self.target_dict_file = _check_exists_and_download(
+                target_dict_file, TRGDICT_URL, TRGDICT_MD5, 'conll05st',
+                download)
+
+        self.word_dict = self._load_dict(self.word_dict_file)
+        self.predicate_dict = self._load_dict(self.verb_dict_file)
+        self.label_dict = self._load_label_dict(self.target_dict_file)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _load_label_dict(self, filename):
+        d = dict()
+        tag_dict = set()
+        with open(filename, 'r') as f:
+            for i, line in enumerate(f):
+                line = line.strip()
+                if line.startswith("B-"):
+                    tag_dict.add(line[2:])
+                elif line.startswith("I-"):
+                    tag_dict.add(line[2:])
+            index = 0
+            for tag in tag_dict:
+                d["B-" + tag] = index
+                index += 1
+                d["I-" + tag] = index
+                index += 1
+            d["O"] = index
+        return d
+
+    def _load_dict(self, filename):
+        d = dict()
+        with open(filename, 'r') as f:
+            for i, line in enumerate(f):
+                d[line.strip()] = i
+        return d
+
+    def _load_anno(self):
+        tf = tarfile.open(self.data_file)
+        wf = tf.extractfile(
+            "conll05st-release/test.wsj/words/test.wsj.words.gz")
+        pf = tf.extractfile(
+            "conll05st-release/test.wsj/props/test.wsj.props.gz")
+        self.sentences = []
+        self.predicates = []
+        self.labels = []
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in zip(words_file, props_file):
+                word = cpt.to_text(word.strip())
+                label = cpt.to_text(label.strip().split())
+
+                if len(label) == 0:  # end of sentence
+                    for i in range(len(one_seg[0])):
+                        a_kind_lable = [x[i] for x in one_seg]
+                        labels.append(a_kind_lable)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and is_in_bracket == False:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket == True:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            self.sentences.append(sentences)
+                            self.predicates.append(verb_list[i])
+                            self.labels.append(lbl_seq)
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    def __getitem__(self, idx):
+        sentence = self.sentences[idx]
+        predicate = self.predicates[idx]
+        labels = self.labels[idx]
+
+        sen_len = len(sentence)
+
+        verb_index = labels.index('B-V')
+        mark = [0] * len(labels)
+        if verb_index > 0:
+            mark[verb_index - 1] = 1
+            ctx_n1 = sentence[verb_index - 1]
+        else:
+            ctx_n1 = 'bos'
+
+        if verb_index > 1:
+            mark[verb_index - 2] = 1
+            ctx_n2 = sentence[verb_index - 2]
+        else:
+            ctx_n2 = 'bos'
+
+        mark[verb_index] = 1
+        ctx_0 = sentence[verb_index]
+
+        if verb_index < len(labels) - 1:
+            mark[verb_index + 1] = 1
+            ctx_p1 = sentence[verb_index + 1]
+        else:
+            ctx_p1 = 'eos'
+
+        if verb_index < len(labels) - 2:
+            mark[verb_index + 2] = 1
+            ctx_p2 = sentence[verb_index + 2]
+        else:
+            ctx_p2 = 'eos'
+
+        word_idx = [self.word_dict.get(w, UNK_IDX) for w in sentence]
+
+        ctx_n2_idx = [self.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+        ctx_n1_idx = [self.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+        ctx_0_idx = [self.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+        ctx_p1_idx = [self.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+        ctx_p2_idx = [self.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+        pred_idx = [self.predicate_dict.get(predicate)] * sen_len
+        label_idx = [self.label_dict.get(w) for w in labels]
+
+        return (np.array(word_idx), np.array(ctx_n2_idx), np.array(ctx_n1_idx),
+                np.array(ctx_0_idx), np.array(ctx_p1_idx), np.array(ctx_p2_idx),
+                np.array(pred_idx), np.array(mark), np.array(label_idx))
+
+    def __len__(self):
+        return len(self.sentences)
+
+    def get_dict(self):
+        """
+        Get the word, verb and label dictionary of Wikipedia corpus.
+        """
+        return self.word_dict, self.predicate_dict, self.label_dict
+
+    def get_embedding(self):
+        return self.emb_file
diff --git a/python/paddle/incubate/hapi/datasets/flowers.py b/python/paddle/incubate/hapi/datasets/flowers.py
index 6f56cc82c1cba800002d82cc8a2bd5ddae619f9e..141d2a53b577b8c9be9ac153a36c5b2fa51ded77 100644
--- a/python/paddle/incubate/hapi/datasets/flowers.py
+++ b/python/paddle/incubate/hapi/datasets/flowers.py
@@ -36,12 +36,13 @@ SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
 # and trnid is the flag of train data. But test data is more than train data.
 # So we exchange the train data and test data.
-MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"}
+MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': 'valid'}
 
 
 class Flowers(Dataset):
     """
-    Implement of flowers dataset
+    Implementation of `Flowers <https://www.robots.ox.ac.uk/~vgg/data/flowers/>`_
+    dataset
 
     Args:
         data_file(str): path to data file, can be set None if
@@ -51,9 +52,9 @@ class Flowers(Dataset):
         setid_file(str): path to subset index file, can be set
             None if :attr:`download` is True. Default None
         mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
-        download(bool): whether auto download mnist dataset if
-            :attr:`image_path`/:attr:`label_path` unset. Default
-            True
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
 
     Examples:
         
@@ -82,19 +83,19 @@ class Flowers(Dataset):
 
         self.data_file = data_file
         if self.data_file is None:
-            assert download, "data_file not set and auto download disabled"
+            assert download, "data_file is not set and downloading automatically is disabled"
             self.data_file = _check_exists_and_download(
                 data_file, DATA_URL, DATA_MD5, 'flowers', download)
 
         self.label_file = label_file
         if self.label_file is None:
-            assert download, "label_file not set and auto download disabled"
+            assert download, "label_file is not set and downloading automatically is disabled"
             self.label_file = _check_exists_and_download(
                 label_file, LABEL_URL, LABEL_MD5, 'flowers', download)
 
         self.setid_file = setid_file
         if self.setid_file is None:
-            assert download, "setid_file not set and auto download disabled"
+            assert download, "setid_file is not set and downloading automatically is disabled"
             self.setid_file = _check_exists_and_download(
                 setid_file, SETID_URL, SETID_MD5, 'flowers', download)
 
diff --git a/python/paddle/incubate/hapi/datasets/imdb.py b/python/paddle/incubate/hapi/datasets/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d166bc784a382ac5ae70491d3e8061ad1d1e9f
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/imdb.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import re
+import six
+import string
+import tarfile
+import numpy as np
+import collections
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Imdb']
+
+URL = 'https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+class Imdb(Dataset):
+    """
+    Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' 'test' mode. Default 'train'.
+        cutoff(int): cutoff number for building word dictionary. Default 150.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of IMDB dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Imdb
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, doc, label):
+		    return paddle.sum(doc), label
+
+	    paddle.disable_static()
+
+	    imdb = Imdb(mode='train')
+
+	    for i in range(10):
+		doc, label = imdb[i]
+		doc = paddle.to_tensor(doc)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(doc, label)
+		print(doc.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self, data_file=None, mode='train', cutoff=150, download=True):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'imdb', download)
+
+        # Build a word dictionary from the corpus
+        self.word_idx = self._build_work_dict(cutoff)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _build_work_dict(self, cutoff):
+        word_freq = collections.defaultdict(int)
+        pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
+        for doc in self._tokenize(pattern):
+            for word in doc:
+                word_freq[word] += 1
+
+        # Not sure if we should prune less-frequent words here.
+        word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
+
+        dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*dictionary))
+        word_idx = dict(list(zip(words, six.moves.range(len(words)))))
+        word_idx['<unk>'] = len(words)
+        return word_idx
+
+    def _tokenize(self, pattern):
+        data = []
+        with tarfile.open(self.data_file) as tarf:
+            tf = tarf.next()
+            while tf != None:
+                if bool(pattern.match(tf.name)):
+                    # newline and punctuations removal and ad-hoc tokenization.
+                    data.append(
+                        tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
+                        .translate(None, six.b(string.punctuation)).lower(
+                        ).split())
+                tf = tarf.next()
+
+        return data
+
+    def _load_anno(self):
+        pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode))
+        neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode))
+
+        UNK = self.word_idx['<unk>']
+
+        self.docs = []
+        self.labels = []
+        for doc in self._tokenize(pos_pattern):
+            self.docs.append([self.word_idx.get(w, UNK) for w in doc])
+            self.labels.append(0)
+        for doc in self._tokenize(neg_pattern):
+            self.docs.append([self.word_idx.get(w, UNK) for w in doc])
+            self.labels.append(1)
+
+    def __getitem__(self, idx):
+        return (np.array(self.docs[idx]), np.array([self.labels[idx]]))
+
+    def __len__(self):
+        return len(self.docs)
diff --git a/python/paddle/incubate/hapi/datasets/imikolov.py b/python/paddle/incubate/hapi/datasets/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e6ad43b506265ee8c9c8617a87eba5a041632bd
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/imikolov.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import six
+import tarfile
+import numpy as np
+import collections
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Imikolov']
+
+URL = 'https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class Imikolov(Dataset):
+    """
+    Implementation of imikolov dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
+        window_size(int): sliding window size for 'NGRAM' data. Default -1.
+        mode(str): 'train' 'test' mode. Default 'train'.
+        min_word_freq(int): minimal word frequence for building word dictionary. Default 50.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of imikolov dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Imikolov
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src, trg):
+		    return paddle.sum(src), paddle.sum(trg)
+
+	    paddle.disable_static()
+
+	    imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)
+
+	    for i in range(10):
+		src, trg = imikolov[i]
+		src = paddle.to_tensor(src)
+		trg = paddle.to_tensor(trg)
+
+		model = SimpleNet()
+		src, trg = model(src, trg)
+		print(src.numpy().shape, trg.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 data_type='NGRAM',
+                 window_size=-1,
+                 mode='train',
+                 min_word_freq=50,
+                 download=True):
+        assert data_type.upper() in ['NGRAM', 'SEQ'], \
+            "data type should be 'NGRAM', 'SEQ', but got {}".format(data_type)
+        self.data_type = data_type.upper()
+
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.window_size = window_size
+        self.min_word_freq = min_word_freq
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'imikolov', download)
+
+        # Build a word dictionary from the corpus
+        self.word_idx = self._build_work_dict(min_word_freq)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def word_count(self, f, word_freq=None):
+        if word_freq is None:
+            word_freq = collections.defaultdict(int)
+
+        for l in f:
+            for w in l.strip().split():
+                word_freq[w] += 1
+            word_freq['<s>'] += 1
+            word_freq['<e>'] += 1
+
+        return word_freq
+
+    def _build_work_dict(self, cutoff):
+        train_filename = './simple-examples/data/ptb.train.txt'
+        test_filename = './simple-examples/data/ptb.valid.txt'
+        with tarfile.open(self.data_file) as tf:
+            trainf = tf.extractfile(train_filename)
+            testf = tf.extractfile(test_filename)
+            word_freq = self.word_count(testf, self.word_count(trainf))
+            if '<unk>' in word_freq:
+                # remove <unk> for now, since we will set it as last index
+                del word_freq['<unk>']
+
+            word_freq = [
+                x for x in six.iteritems(word_freq) if x[1] > self.min_word_freq
+            ]
+
+            word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+            words, _ = list(zip(*word_freq_sorted))
+            word_idx = dict(list(zip(words, six.moves.range(len(words)))))
+            word_idx['<unk>'] = len(words)
+
+        return word_idx
+
+    def _load_anno(self):
+        self.data = []
+        with tarfile.open(self.data_file) as tf:
+            filename = './simple-examples/data/ptb.{}.txt'.format(self.mode)
+            f = tf.extractfile(filename)
+
+            UNK = self.word_idx['<unk>']
+            for l in f:
+                if self.data_type == 'NGRAM':
+                    assert self.window_size > -1, 'Invalid gram length'
+                    l = ['<s>'] + l.strip().split() + ['<e>']
+                    if len(l) >= self.window_size:
+                        l = [self.word_idx.get(w, UNK) for w in l]
+                        for i in six.moves.range(self.window_size, len(l) + 1):
+                            self.data.append(tuple(l[i - self.window_size:i]))
+                elif self.data_type == 'SEQ':
+                    l = l.strip().split()
+                    l = [self.word_idx.get(w, UNK) for w in l]
+                    src_seq = [self.word_idx['<s>']] + l
+                    trg_seq = l + [self.word_idx['<e>']]
+                    if self.window_size > 0 and len(src_seq) > self.window_size:
+                        continue
+                    self.data.append((src_seq, trg_seq))
+                else:
+                    assert False, 'Unknow data type'
+
+    def __getitem__(self, idx):
+        return tuple([np.array(d) for d in self.data[idx]])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/mnist.py b/python/paddle/incubate/hapi/datasets/mnist.py
index bd48ca1c9668b40ac0379bfeda11a5c056f9fd44..ed046e5a1d9bbcc33f3148c6ecde8a349e478cb0 100644
--- a/python/paddle/incubate/hapi/datasets/mnist.py
+++ b/python/paddle/incubate/hapi/datasets/mnist.py
@@ -38,7 +38,7 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
 
 class MNIST(Dataset):
     """
-    Implement of MNIST dataset
+    Implementation of `MNIST <http://yann.lecun.com/exdb/mnist/>`_ dataset
 
     Args:
         image_path(str): path to image file, can be set None if
@@ -48,9 +48,8 @@ class MNIST(Dataset):
         chw_format(bool): If set True, the output shape is [1, 28, 28],
             otherwise, output shape is [1, 784]. Default True.
         mode(str): 'train' or 'test' mode. Default 'train'.
-        download(bool): whether auto download mnist dataset if
-            :attr:`image_path`/:attr:`label_path` unset. Default
-            True
+        download(bool): whether to download dataset automatically if
+            :attr:`image_path` :attr:`label_path` is not set. Default True
 
     Returns:
         Dataset: MNIST Dataset.
@@ -82,7 +81,7 @@ class MNIST(Dataset):
         self.chw_format = chw_format
         self.image_path = image_path
         if self.image_path is None:
-            assert download, "image_path not set and auto download disabled"
+            assert download, "image_path is not set and downloading automatically is disabled"
             image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL
             image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5
             self.image_path = _check_exists_and_download(
@@ -90,9 +89,9 @@ class MNIST(Dataset):
 
         self.label_path = label_path
         if self.label_path is None:
-            assert download, "label_path not set and auto download disabled"
-            label_url = TRAIN_LABEL_URL if mode == 'train' else TEST_LABEL_URL
-            label_md5 = TRAIN_LABEL_MD5 if mode == 'train' else TEST_LABEL_MD5
+            assert download, "label_path is not set and downloading automatically is disabled"
+            label_url = TRAIN_LABEL_URL if self.mode == 'train' else TEST_LABEL_URL
+            label_md5 = TRAIN_LABEL_MD5 if self.mode == 'train' else TEST_LABEL_MD5
             self.label_path = _check_exists_and_download(
                 label_path, label_url, label_md5, 'mnist', download)
 
diff --git a/python/paddle/incubate/hapi/datasets/movie_reviews.py b/python/paddle/incubate/hapi/datasets/movie_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf0684ebcd315807b9dc736c5481383073e5ba8
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/movie_reviews.py
@@ -0,0 +1,173 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import numpy as np
+import collections
+import nltk
+from nltk.corpus import movie_reviews
+import zipfile
+from functools import cmp_to_key
+from itertools import chain
+
+import paddle
+from paddle.io import Dataset
+
+__all__ = ['MovieReviews']
+
+URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
+MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
+
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+class MovieReviews(Dataset):
+    """
+    Implementation of `NLTK movie reviews <http://www.nltk.org/nltk_data/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' 'test' mode. Default 'train'.
+        download(bool): whether auto download cifar dataset if
+            :attr:`data_file` unset. Default True.
+
+    Returns:
+        Dataset: instance of movie reviews dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import MovieReviews
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, word, category):
+		    return paddle.sum(word), category
+
+	    paddle.disable_static()
+
+	    movie_reviews = MovieReviews(mode='train')
+
+	    for i in range(10):
+		word_list, category = movie_reviews[i]
+		word_list = paddle.to_tensor(word_list)
+		category = paddle.to_tensor(category)
+
+		model = SimpleNet()
+		word_list, category = model(word_list, category)
+		print(word_list.numpy().shape, category.numpy())
+
+    """
+
+    def __init__(self, mode='train'):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self._download_data_if_not_yet()
+
+        # read dataset into memory
+        self._load_sentiment_data()
+
+    def _get_word_dict(self):
+        """
+	Sorted the words by the frequency of words which occur in sample
+	:return:
+	    words_freq_sorted
+	"""
+        words_freq_sorted = list()
+        word_freq_dict = collections.defaultdict(int)
+
+        for category in movie_reviews.categories():
+            for field in movie_reviews.fileids(category):
+                for words in movie_reviews.words(field):
+                    word_freq_dict[words] += 1
+        words_sort_list = list(six.iteritems(word_freq_dict))
+        words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1]))
+        for index, word in enumerate(words_sort_list):
+            words_freq_sorted.append((word[0], index))
+        return words_freq_sorted
+
+    def _sort_files(self):
+        """
+	Sorted the sample for cross reading the sample
+	:return:
+	    files_list
+	"""
+        files_list = list()
+        neg_file_list = movie_reviews.fileids('neg')
+        pos_file_list = movie_reviews.fileids('pos')
+        files_list = list(
+            chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
+        return files_list
+
+    def _load_sentiment_data(self):
+        """
+	Load the data set
+	:return:
+	    data_set
+	"""
+        self.data = []
+        words_ids = dict(self._get_word_dict())
+        for sample_file in self._sort_files():
+            words_list = list()
+            category = 0 if 'neg' in sample_file else 1
+            for word in movie_reviews.words(sample_file):
+                words_list.append(words_ids[word.lower()])
+            self.data.append((words_list, category))
+
+    def _download_data_if_not_yet(self):
+        """
+	Download the data set, if the data set is not download.
+	"""
+        try:
+            # download and extract movie_reviews.zip
+            paddle.dataset.common.download(
+                URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
+            path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
+            filename = os.path.join(path, 'movie_reviews.zip')
+            zip_file = zipfile.ZipFile(filename)
+            zip_file.extractall(path)
+            zip_file.close()
+            # make sure that nltk can find the data
+            if paddle.dataset.common.DATA_HOME not in nltk.data.path:
+                nltk.data.path.append(paddle.dataset.common.DATA_HOME)
+            movie_reviews.categories()
+        except LookupError:
+            print("Downloading movie_reviews data set, please wait.....")
+            nltk.download(
+                'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
+            print("Download data set success.....")
+            print("Path is " + nltk.data.find('corpora/movie_reviews').path)
+
+    def __getitem__(self, idx):
+        if self.mode == 'test':
+            idx += NUM_TRAINING_INSTANCES
+        data = self.data[idx]
+        return np.array(data[0]), np.array(data[1])
+
+    def __len__(self):
+        if self.mode == 'train':
+            return NUM_TRAINING_INSTANCES
+        else:
+            return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES
diff --git a/python/paddle/incubate/hapi/datasets/movielens.py b/python/paddle/incubate/hapi/datasets/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..228e9dc6d477cf539683963dc6ddaa3c02c8fe95
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/movielens.py
@@ -0,0 +1,219 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import zipfile
+import re
+import random
+import functools
+import six
+
+import paddle
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['Movielens']
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+URL = 'https://dataset.bj.bcebos.com/movielens%2Fml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
+
+class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self, categories_dict, movie_title_dict):
+        """
+        Get information from a movie.
+        """
+        return [[self.index], [categories_dict[c] for c in self.categories],
+                [movie_title_dict[w.lower()] for w in self.title.split()]]
+
+    def __str__(self):
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        """
+        Get information from a user.
+        """
+        return [[self.index], [0 if self.is_male else 1], [self.age],
+                [self.job_id]]
+
+    def __str__(self):
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
+
+class Movielens(Dataset):
+    """
+    Implementation of `Movielens 1-M <https://grouplens.org/datasets/movielens/1m/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' or 'test' mode. Default 'train'.
+        test_ratio(float): split ratio for test sample. Default 0.1.
+        rand_seed(int): random seed. Default 0.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of Movielens 1-M dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Movielens
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, category, title, rating):
+		    return paddle.sum(category), paddle.sum(title), paddle.sum(rating)
+
+	    paddle.disable_static()
+
+	    movielens = Movielens(mode='train')
+
+	    for i in range(10):
+		category, title, rating = movielens[i][-3:]
+		category = paddle.to_tensor(category)
+		title = paddle.to_tensor(title)
+		rating = paddle.to_tensor(rating)
+
+		model = SimpleNet()
+		category, title, rating = model(category, title, rating)
+		print(category.numpy().shape, title.numpy().shape, rating.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 test_ratio=0.1,
+                 rand_seed=0,
+                 download=True):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'sentiment', download)
+
+        self.test_ratio = test_ratio
+        self.rand_seed = rand_seed
+
+        np.random.seed(rand_seed)
+        self._load_meta_info()
+        self._load_data()
+
+    def _load_meta_info(self):
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        self.movie_info = dict()
+        self.movie_title_dict = dict()
+        self.categories_dict = dict()
+        self.user_info = dict()
+        with zipfile.ZipFile(self.data_file) as package:
+            for info in package.infolist():
+                assert isinstance(info, zipfile.ZipInfo)
+                title_word_set = set()
+                categories_set = set()
+                with package.open('ml-1m/movies.dat') as movie_file:
+                    for i, line in enumerate(movie_file):
+                        line = cpt.to_text(line, encoding='latin')
+                        movie_id, title, categories = line.strip().split('::')
+                        categories = categories.split('|')
+                        for c in categories:
+                            categories_set.add(c)
+                        title = pattern.match(title).group(1)
+                        self.movie_info[int(movie_id)] = MovieInfo(
+                            index=movie_id, categories=categories, title=title)
+                        for w in title.split():
+                            title_word_set.add(w.lower())
+
+                for i, w in enumerate(title_word_set):
+                    self.movie_title_dict[w] = i
+
+                for i, c in enumerate(categories_set):
+                    self.categories_dict[c] = i
+
+                with package.open('ml-1m/users.dat') as user_file:
+                    for line in user_file:
+                        line = cpt.to_text(line, encoding='latin')
+                        uid, gender, age, job, _ = line.strip().split("::")
+                        self.user_info[int(uid)] = UserInfo(
+                            index=uid, gender=gender, age=age, job_id=job)
+
+    def _load_data(self):
+        self.data = []
+        is_test = self.mode == 'test'
+        with zipfile.ZipFile(self.data_file) as package:
+            with package.open('ml-1m/ratings.dat') as rating:
+                for line in rating:
+                    line = cpt.to_text(line, encoding='latin')
+                    if (np.random.random() < self.test_ratio) == is_test:
+                        uid, mov_id, rating, _ = line.strip().split("::")
+                        uid = int(uid)
+                        mov_id = int(mov_id)
+                        rating = float(rating) * 2 - 5.0
+
+                        mov = self.movie_info[mov_id]
+                        usr = self.user_info[uid]
+                        self.data.append(usr.value() + \
+                                         mov.value(self.categories_dict, self.movie_title_dict) + \
+                                         [[rating]])
+
+    def __getitem__(self, idx):
+        data = self.data[idx]
+        return tuple([np.array(d) for d in data])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/uci_housing.py b/python/paddle/incubate/hapi/datasets/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f2c4a5bb5d9d60ba1316e3e2a5f174df94fe99
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/uci_housing.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import six
+import numpy as np
+
+import paddle.dataset.common
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["UCIHousing"]
+
+URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT'
+]
+
+
+class UCIHousing(Dataset):
+    """
+    Implementation of `UCI housing <https://archive.ics.uci.edu/ml/datasets/Housing>`_
+    dataset
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' or 'test' mode. Default 'train'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of UCI housing dataset.
+
+    Examples:
+        
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import UCIHousing
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, feature, target):
+		    return paddle.sum(feature), target
+
+	    paddle.disable_static()
+
+	    uci_housing = UCIHousing(mode='train')
+
+	    for i in range(10):
+		feature, target = uci_housing[i]
+		feature = paddle.to_tensor(feature)
+		target = paddle.to_tensor(target)
+
+		model = SimpleNet()
+		feature, target = model(feature, target)
+		print(feature.numpy().shape, target.numpy())
+
+    """
+
+    def __init__(self, data_file=None, mode='train', download=True):
+        assert mode.lower() in ['train', 'test'], \
+                "mode should be 'train' or 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'uci_housing', download)
+
+        # read dataset into memory
+        self._load_data()
+
+    def _load_data(self, feature_num=14, ratio=0.8):
+        data = np.fromfile(self.data_file, sep=' ')
+        data = data.reshape(data.shape[0] // feature_num, feature_num)
+        maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+            axis=0) / data.shape[0]
+        for i in six.moves.range(feature_num - 1):
+            data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+        offset = int(data.shape[0] * ratio)
+        if self.mode == 'train':
+            self.data = data[:offset]
+        elif self.mode == 'test':
+            self.data = data[offset:]
+
+    def __getitem__(self, idx):
+        data = self.data[idx]
+        return np.array(data[:-1]), np.array(data[-1:])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/voc2012.py b/python/paddle/incubate/hapi/datasets/voc2012.py
new file mode 100644
index 0000000000000000000000000000000000000000..1811c455db530710a0559c077975ab08d6a94ac3
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/voc2012.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import io
+import tarfile
+import numpy as np
+from PIL import Image
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["VOC2012"]
+
+VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
+VOCtrainval_11-May-2012.tar'
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+MODE_FLAG_MAP = {'train': 'trainval', 'test': 'train', 'valid': "val"}
+
+
+class VOC2012(Dataset):
+    """
+    Implementation of `VOC2012 <http://host.robots.ox.ac.uk/pascal/VOC/voc2012/>`_ dataset
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import VOC2012
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, image, label):
+		    return paddle.sum(image), label
+
+	    paddle.disable_static()
+
+	    voc2012 = VOC2012(mode='train')
+
+	    for i in range(10):
+		image, label= voc2012[i]
+		image = paddle.cast(paddle.to_tensor(image), 'float32')
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label= model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'valid', 'test'], \
+            "mode should be 'train', 'valid' or 'test', but got {}".format(mode)
+        self.flag = MODE_FLAG_MAP[mode.lower()]
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, VOC_URL, VOC_MD5, CACHE_DIR, download)
+        self.transform = transform
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _load_anno(self):
+        self.name2mem = {}
+        self.data_tar = tarfile.open(self.data_file)
+        for ele in self.data_tar.getmembers():
+            self.name2mem[ele.name] = ele
+
+        set_file = SET_FILE.format(self.flag)
+        sets = self.data_tar.extractfile(self.name2mem[set_file])
+
+        self.data = []
+        self.labels = []
+
+        for line in sets:
+            line = line.strip()
+            data = DATA_FILE.format(line.decode('utf-8'))
+            label = LABEL_FILE.format(line.decode('utf-8'))
+            self.data.append(data)
+            self.labels.append(label)
+
+    def __getitem__(self, idx):
+        data_file = self.data[idx]
+        label_file = self.labels[idx]
+
+        data = self.data_tar.extractfile(self.name2mem[data_file]).read()
+        label = self.data_tar.extractfile(self.name2mem[label_file]).read()
+        data = Image.open(io.BytesIO(data))
+        label = Image.open(io.BytesIO(label))
+        data = np.array(data)
+        label = np.array(label)
+        if self.transform is not None:
+            data = self.transform(data)
+        return data, label
+
+    def __len__(self):
+        return len(self.data)
+
+    def __del__(self):
+        if self.data_tar:
+            self.data_tar.close()
diff --git a/python/paddle/incubate/hapi/datasets/wmt14.py b/python/paddle/incubate/hapi/datasets/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..b495ea931a80425b8e24b81cdf8fdfd2c0920a3e
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/wmt14.py
@@ -0,0 +1,179 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import tarfile
+import numpy as np
+import gzip
+
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['WMT14']
+
+URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
+                'cslm_joint_paper/data/dev+test.tgz')
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# this is a small set of data for test. The original data is too large and
+# will be add later.
+URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+class WMT14(Dataset):
+    """
+    Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
+    The original WMT14 dataset is too large and a small set of data for set is
+    provided. This module will download dataset from
+    http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' or 'gen'. Default 'train'
+        dict_size(int): word dictionary size. Default -1.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of WMT14 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import WMT14
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src_ids, trg_ids, trg_ids_next):
+		    return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+	    paddle.disable_static()
+
+	    wmt14 = WMT14(mode='train', dict_size=50)
+
+	    for i in range(10):
+		src_ids, trg_ids, trg_ids_next = wmt14[i]
+		src_ids = paddle.to_tensor(src_ids)
+		trg_ids = paddle.to_tensor(trg_ids)
+		trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+		model = SimpleNet()
+		src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+		print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 dict_size=-1,
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'gen'], \
+            "mode should be 'train', 'test' or 'gen', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, URL_TRAIN, MD5_TRAIN, 'wmt14', download)
+
+        # read dataset into memory
+        assert dict_size > 0, "dict_size should be set as positive number"
+        self.dict_size = dict_size
+        self._load_data()
+
+    def _load_data(self):
+        def __to_dict(fd, size):
+            out_dict = dict()
+            for line_count, line in enumerate(fd):
+                if line_count < size:
+                    out_dict[cpt.to_text(line.strip())] = line_count
+                else:
+                    break
+            return out_dict
+
+        self.src_ids = []
+        self.trg_ids = []
+        self.trg_ids_next = []
+        with tarfile.open(self.data_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith("src.dict")
+            ]
+            assert len(names) == 1
+            self.src_dict = __to_dict(f.extractfile(names[0]), self.dict_size)
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith("trg.dict")
+            ]
+            assert len(names) == 1
+            self.trg_dict = __to_dict(f.extractfile(names[0]), self.dict_size)
+
+            file_name = "{}/{}".format(self.mode, self.mode)
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line = cpt.to_text(line)
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        self.src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [self.trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # remove sequence whose length > 80 in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [self.trg_dict[END]]
+                    trg_ids = [self.trg_dict[START]] + trg_ids
+
+                    self.src_ids.append(src_ids)
+                    self.trg_ids.append(trg_ids)
+                    self.trg_ids_next.append(trg_ids_next)
+
+    def __getitem__(self, idx):
+        return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]),
+                np.array(self.trg_ids_next[idx]))
+
+    def __len__(self):
+        return len(self.src_ids)
+
+    def get_dict(self, reverse=False):
+        if reverse:
+            src_dict = {v: k for k, v in six.iteritems(src_dict)}
+            trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
+        return src_dict, trg_dict
diff --git a/python/paddle/incubate/hapi/datasets/wmt16.py b/python/paddle/incubate/hapi/datasets/wmt16.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3cb8bfacadd15f6c0f973a09dbf544bbc396c0
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/wmt16.py
@@ -0,0 +1,247 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+"""
+
+from __future__ import print_function
+
+import os
+import six
+import tarfile
+import numpy as np
+from collections import defaultdict
+
+import paddle
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['WMT16']
+
+DATA_URL = ("http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+class WMT16(Dataset):
+    """
+    Implementation of `WMT16 <http://www.statmt.org/wmt16/>`_ test dataset.
+    ACL2016 Multimodal Machine Translation. Please see this website for more
+    details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+    If you use the dataset created for your task, please cite the following paper:
+    Multi30K: Multilingual English-German Image Descriptions.
+
+    .. code-block:: text
+
+        @article{elliott-EtAl:2016:VL16,
+         author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
+         title     = {Multi30K: Multilingual English-German Image Descriptions},
+         booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+         year      = {2016},
+         pages     = {70--74},
+         year      = 2016
+        }
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' or 'val'. Default 'train'
+        src_dict_size(int): word dictionary size for source language word. Default -1.
+        trg_dict_size(int): word dictionary size for target language word. Default -1.
+        lang(str): source language, 'en' or 'de'. Default 'en'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of WMT16 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import WMT16
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src_ids, trg_ids, trg_ids_next):
+		    return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+	    paddle.disable_static()
+
+	    wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50)
+
+	    for i in range(10):
+		src_ids, trg_ids, trg_ids_next = wmt16[i]
+		src_ids = paddle.to_tensor(src_ids)
+		trg_ids = paddle.to_tensor(trg_ids)
+		trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+		model = SimpleNet()
+		src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+		print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 src_dict_size=-1,
+                 trg_dict_size=-1,
+                 lang='en',
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'val'], \
+            "mode should be 'train', 'test' or 'val', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, DATA_URL, DATA_MD5, 'wmt16', download)
+
+        self.lang = lang
+        assert src_dict_size > 0, "dict_size should be set as positive number"
+        assert trg_dict_size > 0, "dict_size should be set as positive number"
+        self.src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if lang == "en"
+                                                 else TOTAL_DE_WORDS))
+        self.trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if lang == "en"
+                                                 else TOTAL_EN_WORDS))
+
+        # load source and target word dict
+        self.src_dict = self._load_dict(lang, src_dict_size)
+        self.trg_dict = self._load_dict("de" if lang == "en" else "en",
+                                        trg_dict_size)
+
+        # load data
+        self.data = self._load_data()
+
+    def _load_dict(self, lang, dict_size, reverse=False):
+        dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "wmt16/%s_%d.dict" % (lang, dict_size))
+        dict_found = False
+        if os.path.exists(dict_path):
+            with open(dict_path, "rb") as d:
+                dict_found = len(d.readlines()) == dict_size
+        if not dict_found:
+            self._build_dict(dict_path, dict_size, lang)
+
+        word_dict = {}
+        with open(dict_path, "rb") as fdict:
+            for idx, line in enumerate(fdict):
+                if reverse:
+                    word_dict[idx] = cpt.to_text(line.strip())
+                else:
+                    word_dict[cpt.to_text(line.strip())] = idx
+        return word_dict
+
+    def _build_dict(self, dict_path, dict_size, lang):
+        word_dict = defaultdict(int)
+        with tarfile.open(self.data_file, mode="r") as f:
+            for line in f.extractfile("wmt16/train"):
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2: continue
+                sen = line_split[0] if self.lang == "en" else line_split[1]
+                for w in sen.split():
+                    word_dict[w] += 1
+
+        with open(dict_path, "wb") as fout:
+            fout.write(
+                cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
+            for idx, word in enumerate(
+                    sorted(
+                        six.iteritems(word_dict),
+                        key=lambda x: x[1],
+                        reverse=True)):
+                if idx + 3 == dict_size: break
+                fout.write(cpt.to_bytes(word[0]))
+                fout.write(cpt.to_bytes('\n'))
+
+    def _load_data(self):
+        # the index for start mark, end mark, and unk are the same in source
+        # language and target language. Here uses the source language
+        # dictionary to determine their indices.
+        start_id = self.src_dict[START_MARK]
+        end_id = self.src_dict[END_MARK]
+        unk_id = self.src_dict[UNK_MARK]
+
+        src_col = 0 if self.lang == "en" else 1
+        trg_col = 1 - src_col
+
+        self.src_ids = []
+        self.trg_ids = []
+        self.trg_ids_next = []
+        with tarfile.open(self.data_file, mode="r") as f:
+            for line in f.extractfile("wmt16/{}".format(self.mode)):
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    self.src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [self.trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                self.src_ids.append(src_ids)
+                self.trg_ids.append(trg_ids)
+                self.trg_ids_next.append(trg_ids_next)
+
+    def __getitem__(self, idx):
+        return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]),
+                np.array(self.trg_ids_next[idx]))
+
+    def __len__(self):
+        return len(self.src_ids)
+
+    def get_dict(self, lang, reverse=False):
+        """
+	return the word dictionary for the specified language.
+
+	Args:
+	    lang(string): A string indicating which language is the source
+			  language. Available options are: "en" for English
+			  and "de" for Germany.
+	    reverse(bool): If reverse is set to False, the returned python
+			   dictionary will use word as key and use index as value.
+			   If reverse is set to True, the returned python
+			   dictionary will use index as key and word as value.
+
+	Returns:
+	    dict: The word dictionary for the specific language.
+	"""
+
+        dict_size = self.src_dict_size if lang == self.lang else self.trg_dict_size
+
+        dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "wmt16/%s_%d.dict" % (lang, dict_size))
+        assert os.path.exists(dict_path), "Word dictionary does not exist. "
+        "Please invoke paddle.dataset.wmt16.train/test/validation first "
+        "to build the dictionary."
+        return _load_dict(lang, dict_size)
diff --git a/python/paddle/incubate/hapi/distributed.py b/python/paddle/incubate/hapi/distributed.py
index 585f466ea6a1ef5a3d888b7c46fe2908ffd2c769..0e38dc8edc758e9c1b8a96add1df242fb0aecef1 100644
--- a/python/paddle/incubate/hapi/distributed.py
+++ b/python/paddle/incubate/hapi/distributed.py
@@ -49,6 +49,13 @@ class DistributedBatchSampler(BatchSampler):
                      `__len__` for BatchSampler to get sample
                      number of data source.
         batch_size(int): sample indice number in a mini-batch indices.
+        num_replicas(int, optional): porcess number in distributed training.
+            If :attr:`num_replicas` is None, :attr:`num_replicas` will be
+            retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`.
+            Default None.
+        rank(int, optional): the rank of the current process among :attr:`num_replicas`
+            processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
+            :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None.
         shuffle(bool): whther to shuffle indices order before genrating
             batch indices. Default False.
         drop_last(bool): whether drop the last incomplete batch dataset size
@@ -84,7 +91,13 @@ class DistributedBatchSampler(BatchSampler):
                 break
     """
 
-    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=False,
+                 drop_last=False):
         self.dataset = dataset
 
         assert isinstance(batch_size, int) and batch_size > 0, \
@@ -96,9 +109,21 @@ class DistributedBatchSampler(BatchSampler):
         assert isinstance(drop_last, bool), \
                 "drop_last should be a boolean number"
 
+        if num_replicas is not None:
+            assert isinstance(num_replicas, int) and num_replicas > 0, \
+                    "num_replicas should be a positive integer"
+            self.nranks = num_replicas
+        else:
+            self.nranks = ParallelEnv().nranks
+
+        if rank is not None:
+            assert isinstance(rank, int) and rank >= 0, \
+                    "rank should be a non-negative integer"
+            self.local_rank = rank
+        else:
+            self.local_rank = ParallelEnv().local_rank
+
         self.drop_last = drop_last
-        self.nranks = ParallelEnv().nranks
-        self.local_rank = ParallelEnv().local_rank
         self.epoch = 0
         self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
         self.total_size = self.num_samples * self.nranks
diff --git a/python/paddle/incubate/hapi/metrics.py b/python/paddle/incubate/hapi/metrics.py
deleted file mode 100644
index 9e9a2e78524022d7de8ca80a7fb8e3c478dacd36..0000000000000000000000000000000000000000
--- a/python/paddle/incubate/hapi/metrics.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import abc
-import numpy as np
-import paddle.fluid as fluid
-
-import logging
-
-FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-logger = logging.getLogger(__name__)
-
-__all__ = ['Metric', 'Accuracy']
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Metric(object):
-    """
-    Base class for metric, encapsulates metric logic and APIs
-    Usage:
-        
-        m = SomeMetric()
-        for prediction, label in ...:
-            m.update(prediction, label)
-        m.accumulate()
-        
-    Advanced usage for :code:`add_metric_op`
-    Metric calculation can be accelerated by calculating metric states
-    from model outputs and labels by Paddle OPs in :code:`add_metric_op`,
-    metric states will be fetch as numpy array and call :code:`update`
-    with states in numpy format.
-    Metric calculated as follows (operations in Model and Metric are
-    indicated with curly brackets, while data nodes not):
-                 inputs & labels              || ------------------
-                       |                      ||
-                    {model}                   ||
-                       |                      ||
-                outputs & labels              ||
-                       |                      ||    tensor data
-             {Metric.add_metric_op}           ||
-                       |                      ||
-              metric states(tensor)           ||
-                       |                      ||
-                {fetch as numpy}              || ------------------
-                       |                      ||
-              metric states(numpy)            ||    numpy data
-                       |                      ||
-                {Metric.update}               \/ ------------------
-    Examples:
-        
-        For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label`
-        as inputs, we can calculate the correct prediction matrix between
-        :code:`pred` and :code:`label` in :code:`add_metric_op`.
-        For examples, prediction results contains 10 classes, while :code:`pred`
-        shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size,
-        and we only need to calculate accurary of top-1 and top-5, we could
-        calculated the correct prediction matrix of the top-5 scores of the
-        prediction of each sample like follows, while the correct prediction
-        matrix shape is [N, 5].
-        .. code-block:: python
-            def add_metric_op(pred, label):
-                # sort prediction and slice the top-5 scores
-                pred = fluid.layers.argsort(pred, descending=True)[1][:, :5]
-                # calculate whether the predictions are correct
-                correct = pred == label
-                return fluid.layers.cast(correct, dtype='float32')
-        With the :code:`add_metric_op`, we split some calculations to OPs(which
-        may run on GPU devices, will be faster), and only fetch 1 tensor with
-        shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1].
-        :code:`update` can be define as follows:
-        .. code-block:: python
-            def update(self, correct):
-                accs = []
-                for i, k in enumerate(self.topk):
-                    num_corrects = correct[:, :k].sum()
-                    num_samples = len(correct)
-                    accs.append(float(num_corrects) / num_samples)
-                    self.total[i] += num_corrects
-                    self.count[i] += num_samples
-                return accs
-    """
-
-    def __init__(self):
-        pass
-
-    @abc.abstractmethod
-    def reset(self):
-        """
-        Reset states and result
-        """
-        raise NotImplementedError("function 'reset' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def update(self, *args):
-        """
-        Update states for metric
-
-        Inputs of :code:`update` is the outputs of :code:`Metric.add_metric_op`,
-        if :code:`add_metric_op` is not defined, the inputs of :code:`update`
-        will be flatten arguments of **output** of mode and **label** from data:
-        :code:`update(output1, output2, ..., label1, label2,...)`
-
-        see :code:`Metric.add_metric_op`
-        """
-        raise NotImplementedError("function 'update' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def accumulate(self):
-        """
-        Accumulates statistics, computes and returns the metric value
-        """
-        raise NotImplementedError(
-            "function 'accumulate' not implemented in {}.".format(
-                self.__class__.__name__))
-
-    @abc.abstractmethod
-    def name(self):
-        """
-        Returns metric name
-        """
-        raise NotImplementedError("function 'name' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    def add_metric_op(self, *args):
-        """
-        This API is advanced usage to accelerate metric calculating, calulations
-        from outputs of model to the states which should be updated by Metric can
-        be defined here, where Paddle OPs is also supported. Outputs of this API
-        will be the inputs of "Metric.update".
-
-        If :code:`add_metric_op` is defined, it will be called with **outputs**
-        of model and **labels** from data as arguments, all outputs and labels
-        will be concatenated and flatten and each filed as a separate argument
-        as follows:
-        :code:`add_metric_op(output1, output2, ..., label1, label2,...)`
-
-        If :code:`add_metric_op` is not defined, default behaviour is to pass
-        input to output, so output format will be:
-        :code:`return output1, output2, ..., label1, label2,...`
-
-        see :code:`Metric.update`
-        """
-        return args
-
-
-class Accuracy(Metric):
-    """
-    Encapsulates accuracy metric logic
-
-    Examples:
-        
-        .. code-block:: python
-
-        import paddle
-        import paddle.fluid as fluid
-        import paddle.incubate.hapi as hapi
-
-        fluid.enable_dygraph()
-
-        train_dataset = hapi.datasets.MNIST(mode='train')
-
-        model = hapi.Model(hapi.vision.LeNet(classifier_activation=None))
-        optim = fluid.optimizer.Adam(
-            learning_rate=0.001, parameter_list=model.parameters())
-        model.prepare(
-            optim,
-            loss_function=paddle.nn.CrossEntropyLoss(),
-            metrics=hapi.metrics.Accuracy())
-
-        model.fit(train_dataset, batch_size=64)
-
-    """
-
-    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
-        super(Accuracy, self).__init__(*args, **kwargs)
-        self.topk = topk
-        self.maxk = max(topk)
-        self._init_name(name)
-        self.reset()
-
-    def add_metric_op(self, pred, label, *args):
-        pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk]
-        correct = pred == label
-        return fluid.layers.cast(correct, dtype='float32')
-
-    def update(self, correct, *args):
-        accs = []
-        for i, k in enumerate(self.topk):
-            num_corrects = correct[:, :k].sum()
-            num_samples = len(correct)
-            accs.append(float(num_corrects) / num_samples)
-            self.total[i] += num_corrects
-            self.count[i] += num_samples
-        return accs
-
-    def reset(self):
-        self.total = [0.] * len(self.topk)
-        self.count = [0] * len(self.topk)
-
-    def accumulate(self):
-        res = []
-        for t, c in zip(self.total, self.count):
-            res.append(float(t) / c)
-        return res
-
-    def _init_name(self, name):
-        name = name or 'acc'
-        if self.maxk != 1:
-            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
-        else:
-            self._name = [name]
-
-    def name(self):
-        return self._name
diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py
index 0b12987b10a0510e1035e2b64439de9abe3fcf31..e4a6b03f7aa5c7f537dc476d8f80162e530d5dbe 100644
--- a/python/paddle/incubate/hapi/model.py
+++ b/python/paddle/incubate/hapi/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,19 +24,27 @@ import six
 import warnings
 from collections import Iterable
 
+import paddle
 from paddle import fluid
-from paddle.fluid.framework import in_dygraph_mode, Variable
+from paddle.fluid import core
+from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place
+# Note: Use alias `Input` temporarily before releasing hapi feature.
+from paddle.static import InputSpec as Input
 from paddle.fluid.executor import global_scope
 from paddle.fluid.io import is_belong_to_optimizer
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 from paddle.fluid.incubate.fleet.base import role_maker
+from paddle.fluid.executor import scope_guard, Executor
 from paddle.io import DataLoader, Dataset
 
+from paddle.fluid.dygraph.layers import Layer
+from paddle.metric import Metric
+
 from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
-from .metrics import Metric
 from .callbacks import config_callbacks
 from .utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args
 from .device import _get_device
@@ -47,40 +55,6 @@ __all__ = [
 ]
 
 
-class Input(fluid.dygraph.Layer):
-    """
-    Define inputs the model.
-
-    Args:
-        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
-            for more details.
-        shape (tuple(integers)|list[integers]): List|Tuple of integers
-            declaring the shape. You can set "None" or -1 at a dimension
-            to indicate the dimension can be of any size. For example,
-            it is useful to set changeable batch size as "None" or -1.
-        dtype (np.dtype|VarType|str, optional): The type of the data. Supported
-            dtype: bool, float16, float32, float64, int8, int16, int32, int64,
-            uint8. Default: float32.
-
-    Examples:
-        .. code-block:: python
-
-        import paddle.incubate.hapi as hapi
-
-        input = hapi.Input('x', [None, 784], 'float32')
-        label = hapi.Input('label', [None, 1], 'int64')
-    """
-
-    def __init__(self, name, shape=None, dtype='float32'):
-        super(Input, self).__init__()
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-
-    def forward(self):
-        return fluid.data(self.name, shape=self.shape, dtype=self.dtype)
-
-
 class StaticGraphAdapter(object):
     """
     Model traning/inference with a static graph.
@@ -388,13 +362,13 @@ class StaticGraphAdapter(object):
         with fluid.program_guard(prog, self._startup_prog):
             inputs = self.model._inputs
             labels = self.model._labels if self.model._labels else []
-            inputs = [k.forward() for k in to_list(inputs)]
-            labels = [k.forward() for k in to_list(labels)]
+            inputs = [k._create_feed_layer() for k in to_list(inputs)]
+            labels = [k._create_feed_layer() for k in to_list(labels)]
             self._label_vars[mode] = labels
             outputs = to_list(self.model.network.forward(*inputs))
 
-            if mode != 'test' and self.model._loss_function:
-                losses = self.model._loss_function(*(outputs + labels))
+            if mode != 'test' and self.model._loss:
+                losses = self.model._loss(*(outputs + labels))
 
             if self._nranks > 1 and mode != 'train':
                 outputs = [_all_gather(o, self._nranks) for o in outputs]
@@ -403,8 +377,7 @@ class StaticGraphAdapter(object):
 
             if mode != 'test':
                 for metric in self.model._metrics:
-                    metrics.append(
-                        to_list(metric.add_metric_op(*(outputs + labels))))
+                    metrics.append(to_list(metric.compute(*(outputs + labels))))
 
             if mode == 'train' and self.model._optimizer:
                 self._loss_endpoint = fluid.layers.sum(losses)
@@ -509,7 +482,7 @@ class DynamicGraphAdapter(object):
 
         if self._nranks > 1:
             outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs])
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
@@ -518,7 +491,7 @@ class DynamicGraphAdapter(object):
         else:
             outputs = self.model.network.forward(
                 * [to_variable(x) for x in inputs])
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -527,7 +500,7 @@ class DynamicGraphAdapter(object):
         self.model.network.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(*(to_list(outputs) + labels))
+            metric_outs = metric.compute(*(to_list(outputs) + labels))
             m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
@@ -542,8 +515,8 @@ class DynamicGraphAdapter(object):
         labels = [to_variable(l) for l in to_list(labels)]
 
         outputs = self.model.network.forward(* [to_variable(x) for x in inputs])
-        if self.model._loss_function:
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+        if self.model._loss:
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
 
         if self._nranks > 1:
@@ -571,13 +544,13 @@ class DynamicGraphAdapter(object):
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples
 
-            metric_outs = metric.add_metric_op(*(to_list(outputs) + labels))
+            metric_outs = metric.compute(*(to_list(outputs) + labels))
             m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
-        if self.model._loss_function and len(metrics):
+        if self.model._loss and len(metrics):
             return [to_numpy(l) for l in losses], metrics
-        elif self.model._loss_function:
+        elif self.model._loss:
             return [to_numpy(l) for l in losses]
         else:
             return metrics
@@ -665,21 +638,21 @@ class Model(object):
     """
     An Model object is network with training and inference features.
     Dynamic graph and static graph are supported at the same time,
-    switched by `fluid.enable_dygraph()`. The usage is as follows.
+    switched by `paddle.disable_static()`. The usage is as follows.
     But note, the switching between dynamic and static should be before
     instantiating a Model. The input description, i.e, hapi.Input,
     must be required for static graph.
 
     Args:
-        network (fluid.dygraph.Layer): The network is an instance of
-            fluid.dygraph.Layer.
+        network (paddle.nn.Layer): The network is an instance of
+            paddle.nn.Layer.
         inputs (Input|list|dict|None): `inputs`, entry points of network,
             could be a Input layer, or lits of Input layers,
             or dict (name: Input), or None. For static graph,
             inputs must be set. For dynamic graph, it could be None.
         labels (Input|list|None): `labels`, entry points of network,
             could be a Input layer or lits of Input layers, or None.
-            For static graph, if labels is required in loss_function,
+            For static graph, if labels is required in loss,
             labels must be set. Otherwise, it could be None.
 
 
@@ -687,13 +660,12 @@ class Model(object):
         .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import paddle.incubate.hapi as hapi
         
-        class MyNet(fluid.dygraph.Layer):
+        class MyNet(paddle.nn.Layer):
             def __init__(self, classifier_act=None):
                 super(MyNet, self).__init__()
-                self._fc1 = fluid.dygraph.Linear(784, 200, act=classifier_act)
+                self._fc1 = paddle.nn.Linear(784, 200, act=classifier_act)
 
             def forward(self, x):
                 y = self._fc1(x)
@@ -701,18 +673,18 @@ class Model(object):
         
         device = hapi.set_device('gpu')
         # if use static graph, do not set
-        fluid.enable_dygraph(device)
+        paddle.disable_static(device)
         
         # inputs and labels are not required for dynamic graph.
-        input = hapi.Input('x', [None, 784], 'float32')
-        label = hapi.Input('label', [None, 1], 'int64')
+        input = hapi.Input([None, 784], 'float32', 'x')
+        label = hapi.Input([None, 1], 'int64', 'label')
         
         model = hapi.Model(MyNet(), input, label)
-        optim = fluid.optimizer.SGD(learning_rate=1e-3,
+        optim = paddle.optimizer.SGD(learning_rate=1e-3,
             parameter_list=model.parameters())
         model.prepare(optim,
                       paddle.nn.CrossEntropyLoss(),
-                      hapi.metrics.Accuracy())
+                      paddle.metric.Accuracy())
         
         mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
         model.fit(mnist_data, epochs=2, batch_size=32, verbose=1)
@@ -724,7 +696,7 @@ class Model(object):
         self.network = network
         self._inputs = None
         self._labels = None
-        self._loss_function = None
+        self._loss = None
         self._loss_weights = None
         self._optimizer = None
         self._optimizer = None
@@ -734,16 +706,8 @@ class Model(object):
             if not isinstance(inputs, (list, dict, Input)):
                 raise TypeError(
                     "'inputs' must be list or dict in static graph mode")
-        if inputs is None:
-            self._inputs = [Input(name=n) \
-                for n in extract_args(self.network.forward) if n != 'self']
-        elif isinstance(input, dict):
-            self._inputs = [inputs[n] \
-                for n in extract_args(self.network.forward) if n != 'self']
-        else:
-            self._inputs = to_list(inputs)
-
-        self._labels = to_list(labels)
+        self._inputs = self._verify_spec(inputs, True)
+        self._labels = self._verify_spec(labels)
 
         # init backend
         if fluid.in_dygraph_mode():
@@ -772,25 +736,24 @@ class Model(object):
             
               import numpy as np
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self, classifier_act=None):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act)
+                      self._fc = paddle.nn.Linear(784, 10, act=classifier_act)
 
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
-              input = hapi.Input('x', [None, 784], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 784], 'float32', 'x')
+              label = hapi.Input([None, 1], 'int64', 'label')
               model = hapi.Model(MyNet(), input, label)
-              optim = fluid.optimizer.SGD(learning_rate=1e-3,
+              optim = paddle.optimizer.SGD(learning_rate=1e-3,
                   parameter_list=model.parameters())
               model.prepare(optim, paddle.nn.CrossEntropyLoss())
               data = np.random.random(size=(4,784)).astype(np.float32)
@@ -821,25 +784,24 @@ class Model(object):
             
               import numpy as np
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self, classifier_act=None):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act)
+                      self._fc = paddle.nn.Linear(784, 10, act=classifier_act)
 
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
-              input = hapi.Input('x', [None, 784], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 784], 'float32', 'x')
+              label = hapi.Input([None, 1], 'int64', 'label')
               model = hapi.Model(MyNet(), input, label)
-              optim = fluid.optimizer.SGD(learning_rate=1e-3,
+              optim = paddle.optimizer.SGD(learning_rate=1e-3,
                   parameter_list=model.parameters())
               model.prepare(optim,
                             paddle.nn.CrossEntropyLoss())
@@ -867,46 +829,54 @@ class Model(object):
             .. code-block:: python
             
               import numpy as np
-              import paddle.fluid as fluid
+              import paddle
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                      self._fc = paddle.nn.Linear(784, 1, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
               model = hapi.Model(MyNet())
               model.prepare()
               data = np.random.random(size=(4,784)).astype(np.float32)
-              out = model.eval_batch([data])
+              out = model.test_batch([data])
               print(out)
         """
         return self._adapter.test_batch(inputs)
 
-    def save(self, path):
-        """
-        This function saves parameters, optimizer infomation to path.
+    def save(self, path, training=True):
+        """  
+        This function saves parameters, optimizer information or model and 
+        paramters only for inference to path. It depends on the parameter
+        `training`.
 
-        The parameters contains all the trainable Variable, will save to
-        a file with suffix ".pdparams".
+        If `training` is set to True, the parameters saved contain all 
+        the trainable Variable, will save to a file with suffix ".pdparams".
         The optimizer information contains all the variable used by optimizer.
         For Adam optimizer, contains beta1, beta2, momentum etc. All the
         information will save to a file with suffix ".pdopt". (If the optimizer
         have no variable need to save (like SGD), the fill will not generated).
+        This function will silently overwrite existing file at the target location.
 
-        This function will silently overwrite existing file
-        at the target location.
+        If `training` is set to False, only inference model will be saved. It 
+        should be noted that before using `save`, you should run the model, and 
+        the shape of input you saved is as same as the input of its running.
+        `@paddle.jit.to_static` must be added on `forward` function of your layer 
+        in dynamic mode now and these will be optimized later.
 
         Args:
             path (str): The file prefix to save model. The format is
                 'dirname/file_prefix' or 'file_prefix'. if empty str. A exception
                  will be raised.
+            training (bool, optional): Whether to save for training. If not, save
+                for inference only. Default: True.
 
         Returns:
             None
@@ -914,25 +884,47 @@ class Model(object):
         Examples:
 
             .. code-block:: python
-            
-              import paddle.fluid as fluid
-              import paddle.incubate.hapi as hapi
-              
-              class MyNet(fluid.dygraph.Layer):
-                  def __init__(self):
-                      super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                import paddle
+                import paddle.incubate.hapi as hapi
+                from paddle.nn import Linear
+                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
+
+                class Mnist(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyNet, self).__init__()
+                        self._fc = Linear(784, 1, act='softmax')
+
+                  @paddle.jit.to_static # If save for inference in dygraph, need this
                   def forward(self, x):
                       y = self._fc(x)
                       return y
-              
-              device = hapi.set_device('cpu')
-              fluid.enable_dygraph(device)
-              model = hapi.Model(MyNet())
-              model.save('checkpoint/test')
+
+                dynamic = True # False
+                device = hapi.set_device('cpu')
+                # if use static graph, do not set
+                paddle.disable_static(device) if dynamic else None
+
+                # inputs and labels are not required for dynamic graph.
+                input = hapi.Input([None, 784], 'float32', 'x')
+                label = hapi.Input([None, 1], 'int64', 'label')
+
+                model = hapi.Model(Mnist(), input, label)
+                optim = paddle.optimizer.SGD(learning_rate=1e-3,
+                    parameter_list=model.parameters())
+                model.prepare(optim,
+                                paddle.nn.CrossEntropyLoss(),
+                                hapi.metrics.Accuracy())
+                mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
+                model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
+                model.save('checkpoint/test') # save for training
+                model.save('inference_model', False) # save for inference
         """
+
         if ParallelEnv().local_rank == 0:
-            self._adapter.save(path)
+            if not training:
+                self._save_inference_model(path)
+            else:
+                self._adapter.save(path)
 
     def load(self, path, skip_mismatch=False, reset_optimizer=False):
         """
@@ -967,19 +959,19 @@ class Model(object):
 
             .. code-block:: python
             
-              import paddle.fluid as fluid
+              import paddle
               import paddle.incubate.hapi as hapi
               
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                      self._fc = paddle.nn.Linear(784, 1, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
               
               device = hapi.set_device('cpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
               model = hapi.Model(MyNet())
               model.load('checkpoint/test')
         """
@@ -1042,24 +1034,24 @@ class Model(object):
 
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.incubate.hapi import Model
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(20, 10, act='softmax')
+                      self._fc = paddle.nn.Linear(20, 10, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
-              fluid.enable_dygraph()
+              paddle.disable_static()
               model = Model(MyNet())
               params = model.parameters()
         """
         return self._adapter.parameters()
 
-    def prepare(self, optimizer=None, loss_function=None, metrics=None):
+    def prepare(self, optimizer=None, loss=None, metrics=None):
         """
         Configures the model before runing.
 
@@ -1067,8 +1059,8 @@ class Model(object):
             optimizer (Optimizer|None): Optimizer must be set in training
                 and should be a Optimizer instance. It can be None in eval
                 and test mode.
-            loss_function (Loss|callable function|None): Loss function can
-                be a `fluid.dygraph.Layer` instance or any callable function
+            loss (Loss|callable function|None): Loss function can
+                be a `paddle.nn.Layer` instance or any callable function
                 taken the predicted values and ground truth values as input.
                 It can be None when there is no loss.
             metrics (Metric|list of Metric|None): If metrics is set, all
@@ -1087,7 +1079,7 @@ class Model(object):
                     startup_prog_seed = fluid.default_startup_program(
                     ).random_seed
                     fluid.disable_dygraph()
-                    fluid.enable_dygraph(self._place)
+                    paddle.disable_static(self._place)
                     # enable_dygraph would create and switch to a new program,
                     # thus also copy seed to the new program
                     fluid.default_main_program().random_seed = main_prog_seed
@@ -1099,12 +1091,11 @@ class Model(object):
                 _parallel_context_initialized = True
 
         self._optimizer = optimizer
-        if loss_function:
-            if not isinstance(loss_function, fluid.dygraph.Layer) or \
-               not callable(loss_function):
-                raise TypeError("'loss_function' must be sub classes of \
-                    `fluid.dygraph.Layer` or any callable function.")
-        self._loss_function = loss_function
+        if loss is not None:
+            if not isinstance(loss, paddle.nn.Layer) and not callable(loss):
+                raise TypeError("'loss' must be sub classes of " \
+                    "`paddle.nn.Layer` or any callable function.")
+        self._loss = loss
 
         metrics = metrics or []
         for metric in to_list(metrics):
@@ -1184,27 +1175,26 @@ class Model(object):
             .. code-block:: python
 
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
               dynamic = True
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device) if dynamic else None
+              paddle.disable_static(device) if dynamic else None
            
               train_dataset = hapi.datasets.MNIST(mode='train')
               val_dataset = hapi.datasets.MNIST(mode='test')
            
-              input = hapi.Input('image', [None, 1, 28, 28], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 1, 28, 28], 'float32', 'image')
+              label = hapi.Input([None, 1], 'int64', 'label')
            
               model = hapi.Model(hapi.vision.LeNet(classifier_activation=None),
                   input, label)
-              optim = fluid.optimizer.Adam(
-                  learning_rate=0.001, parameter_list=model.parameters())
+              optim = paddle.optimizer.Adam(
+                  learning_rate=0.001, parameters=model.parameters())
               model.prepare(
                   optim,
                   paddle.nn.CrossEntropyLoss(),
-                  hapi.metrics.Accuracy(topk=(1, 2)))
+                  paddle.metric.Accuracy(topk=(1, 2)))
               model.fit(train_dataset,
                         val_dataset,
                         epochs=2,
@@ -1217,31 +1207,30 @@ class Model(object):
             .. code-block:: python
 
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
               dynamic = True
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device) if dynamic else None
+              paddle.disable_static(device) if dynamic else None
            
               train_dataset = hapi.datasets.MNIST(mode='train')
-              train_loader = fluid.io.DataLoader(train_dataset,
+              train_loader = paddle.io.DataLoader(train_dataset,
                   places=device, batch_size=64)
               val_dataset = hapi.datasets.MNIST(mode='test')
-              val_loader = fluid.io.DataLoader(val_dataset,
+              val_loader = paddle.io.DataLoader(val_dataset,
                   places=device, batch_size=64)
            
-              input = hapi.Input('image', [None, 1, 28, 28], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 1, 28, 28], 'float32', 'image')
+              label = hapi.Input([None, 1], 'int64', 'label')
            
               model = hapi.Model(hapi.vision.LeNet(classifier_activation=None),
                   input, label)
-              optim = fluid.optimizer.Adam(
-                  learning_rate=0.001, parameter_list=model.parameters())
+              optim = paddle.optimizer.Adam(
+                  learning_rate=0.001, parameters=model.parameters())
               model.prepare(
                   optim,
                   paddle.nn.CrossEntropyLoss(),
-                  hapi.metrics.Accuracy(topk=(1, 2)))
+                  paddle.metric.Accuracy(topk=(1, 2)))
               model.fit(train_loader,
                         val_loader,
                         epochs=2,
@@ -1353,24 +1342,24 @@ class Model(object):
         Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import paddle.incubate.hapi as hapi
 
             # declarative mode
             val_dataset = hapi.datasets.MNIST(mode='test')
 
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
-            label = hapi.Input('label', [None, 1], 'int64')
+            input = hapi.Input([-1, 1, 28, 28], 'float32', 'image')
+            label = hapi.Input([None, 1], 'int64', 'label')
             model = hapi.Model(hapi.vision.LeNet(), input, label)
-            model.prepare(metrics=hapi.metrics.Accuracy())
+            model.prepare(metrics=paddle.metric.Accuracy())
 
             result = model.evaluate(val_dataset, batch_size=64)
             print(result)
 
             # imperative mode
-            fluid.enable_dygraph()
+            paddle.disable_static()
             model = hapi.Model(hapi.vision.LeNet())
-            model.prepare(metrics=hapi.metrics.Accuracy())
+            model.prepare(metrics=paddle.metric.Accuracy())
             result = model.evaluate(val_dataset, batch_size=64)
             print(result)
                 
@@ -1433,12 +1422,13 @@ class Model(object):
             num_workers (int): The number of subprocess to load data, 0 for no subprocess 
                 used and loading data in main process. When train_data and eval_data are
                 both the instance of Dataloader, this argument will be ignored. Default: 0.
-            stack_output (bool): Whether stack output field like a batch, as for an output
+            stack_outputs (bool): Whether stack output field like a batch, as for an output
                 filed of a sample is in shape [X, Y], test_data contains N samples, predict
                 output field will be in shape [N, X, Y] if stack_output is True, and will
                 be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs
                 is False. stack_outputs as False is used for LoDTensor output situation,
                 it is recommended set as True if outputs contains no LoDTensor. Default: False.
+            callbacks(Callback): A Callback instance, default None.
         Returns:
             list: output of models.
 
@@ -1446,7 +1436,7 @@ class Model(object):
         .. code-block:: python
 
             import numpy as np
-            import paddle.fluid as fluid
+            import paddle
             import paddle.incubate.hapi as hapi
 
             class MnistDataset(hapi.datasets.MNIST):
@@ -1466,7 +1456,7 @@ class Model(object):
             test_dataset = MnistDataset(mode='test', return_label=False)
 
             # declarative mode
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
+            input = hapi.Input([-1, 1, 28, 28], 'float32', 'image')
             model = hapi.Model(hapi.vision.LeNet(), input)
             model.prepare()
 
@@ -1475,7 +1465,7 @@ class Model(object):
 
             # imperative mode
             device = hapi.set_device('cpu')
-            fluid.enable_dygraph(device)
+            paddle.disable_static(device)
             model = hapi.Model(hapi.vision.LeNet())
             model.prepare()
             result = model.predict(test_dataset, batch_size=64)
@@ -1519,13 +1509,17 @@ class Model(object):
         cbks.on_end('test', logs)
         return outputs
 
-    def save_inference_model(self,
-                             save_dir,
-                             model_filename=None,
-                             params_filename=None,
-                             model_only=False):
+    def _save_inference_model(self,
+                              save_dir,
+                              model_filename=None,
+                              params_filename=None,
+                              model_only=False):
         """
-        Save inference model must in static mode.
+        Save inference model can be in static or dynamic mode.
+        It should be noted that before using `save_inference_model`, you should
+        run the model, and the shape you saved is as same as the input of its
+        running. `@paddle.jit.to_static` must be added on `forward` function of
+        your layer in dynamic mode now and these will be optimized later.
 
         Args:
             save_dir (str): The directory path to save the inference model.
@@ -1541,40 +1535,142 @@ class Model(object):
         Returns:
             list: The fetch variables' name list
 
-
         Examples:
         .. code-block:: python
+            import numpy as np
+            import paddle
+            from paddle.static import InputSpec
 
-            import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
-
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
-            model = hapi.Model(hapi.vision.LeNet(), input)
-            model.prepare()
-
+            from paddle.nn import Linear
+            from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
+
+            class Mnist(Layer):
+                def __init__(self, classifier_act=None):
+                    super(Mnist, self).__init__()
+
+                    self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
+
+                @paddle.jit.to_static # In static mode, you need to delete this.
+                def forward(self, inputs):
+                    outputs = self.fc(inputs)
+                    return outputs
+
+            dynamic = True # False
+            device = hapi.set_device('gpu')
+
+            # if use static graph, do not set
+            paddle.disable_static(device) if dynamic else None
+
+            # inputs and labels are not required for dynamic graph.
+            input = InputSpec([None, 784], 'float32', 'x')
+            label = InputSpec([None, 1], 'int64', 'label')
+
+            model = hapi.Model(Mnist(), input, label)
+            optim = paddle.optimizer.SGD(learning_rate=1e-3,
+                parameter_list=model.parameters())
+            model.prepare(optim,
+                            paddle.nn.CrossEntropyLoss(),
+                            hapi.metrics.Accuracy())
+            mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
+            model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
             model.save_inference_model('inference_model')
         """
-        assert not fluid.in_dygraph_mode(
-        ), 'Save inference model must in static mode!'
 
-        prog = self._adapter._progs.get('test', None)
-        assert prog, \
-            "Model is not ready, please call `model.prepare()` first"
+        def get_inout_spec(all_vars, return_name=False):
+            result_list = []
+            valid_vars = [var for var in all_vars if isinstance(var, Variable)]
+            result_list = valid_vars
+            if return_name:
+                result_list = [var.name for var in result_list]
 
-        infer_prog = prog.clone(for_test=True)
+            return result_list
 
-        input_names = [v.name for v in self._adapter._input_vars['test']]
-        endpoints = self._adapter._endpoints['test']['output']
+        # TODO:
+        # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph.
+        # 2. Save correct shape of input, now the interface stores the shape that the user sent to 
+        #    the inputs of the model in running.
+        # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
+        if fluid.in_dygraph_mode():
+            layer = self.network
+            fluid.disable_dygraph()
+
+            # 1. input check
+            prog_translator = ProgramTranslator()
+            if not prog_translator.enable_declarative:
+                raise RuntimeError(
+                    "save_inference_model doesn't work when setting ProgramTranslator.enable=False."
+                )
+            if not isinstance(layer, Layer):
+                raise TypeError(
+                    "The input layer should be 'Layer', but received layer type is %s."
+                    % type(layer))
+
+            # 2. get program of declarative Layer.forward
+            concrete_program = layer.forward.concrete_program
+
+            # NOTE: we maintain the mapping of variable name to
+            # structured name, the buffer variable (non-persistable)
+            # saved to inference program may not need by dygraph Layer,
+            # we only record the state_dict variable's structured name
+            state_names_dict = dict()
+            for structured_name, var in layer.state_dict().items():
+                state_names_dict[var.name] = structured_name
+
+            # 3. share parameters from Layer to scope & record var info
+            scope = core.Scope()
+            extra_var_info = dict()
+            for param_or_buffer in concrete_program.parameters:
+                # share to scope
+                param_or_buffer_tensor = scope.var(
+                    param_or_buffer.name).get_tensor()
+                src_tensor = param_or_buffer.value().get_tensor()
+                param_or_buffer_tensor._share_data_with(src_tensor)
+                # record var info
+                extra_info_dict = dict()
+                if param_or_buffer.name in state_names_dict:
+                    extra_info_dict['structured_name'] = state_names_dict[
+                        param_or_buffer.name]
+                extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
+                if isinstance(param_or_buffer, ParamBase):
+                    extra_info_dict['trainable'] = param_or_buffer.trainable
+                extra_var_info[param_or_buffer.name] = extra_info_dict
+
+            # 4. build input & output spec
+            input_var_names = get_inout_spec(concrete_program.inputs, True)
+            output_vars = get_inout_spec(concrete_program.outputs)
+
+            # 5. save inference model
+            with scope_guard(scope):
+                return fluid.io.save_inference_model(
+                    dirname=save_dir,
+                    feeded_var_names=input_var_names,
+                    target_vars=output_vars,
+                    executor=Executor(_current_expected_place()),
+                    main_program=concrete_program.main_program.clone(),
+                    model_filename=model_filename,
+                    params_filename=params_filename,
+                    program_only=model_only)
+
+        else:
+            prog = self._adapter._progs.get('test', None)
+            assert prog, \
+                "Model is not ready, please call `model.prepare()` first"
 
-        return fluid.io.save_inference_model(
-            save_dir,
-            input_names,
-            endpoints,
-            self._adapter._executor,
-            main_program=infer_prog,
-            model_filename=model_filename,
-            params_filename=params_filename,
-            program_only=model_only)
+            infer_prog = prog.clone(for_test=True)
+
+            input_names = [v.name for v in self._adapter._input_vars['test']]
+            endpoints = self._adapter._endpoints['test']['output']
+
+            return fluid.io.save_inference_model(
+                save_dir,
+                input_names,
+                endpoints,
+                self._adapter._executor,
+                main_program=infer_prog,
+                model_filename=model_filename,
+                params_filename=params_filename,
+                program_only=model_only)
 
     def _run_one_epoch(self, data_loader, callbacks, mode, logs={}):
         outputs = []
@@ -1601,9 +1697,9 @@ class Model(object):
             if mode != 'test':
                 outs = getattr(self, mode + '_batch')(data[:len(self._inputs)],
                                                       data[len(self._inputs):])
-                if self._metrics and self._loss_function:
+                if self._metrics and self._loss:
                     metrics = [[l[0] for l in outs[0]]]
-                elif self._loss_function:
+                elif self._loss:
                     metrics = [[l[0] for l in outs]]
                 else:
                     metrics = []
@@ -1639,12 +1735,43 @@ class Model(object):
             return logs, outputs
         return logs
 
+    def _verify_spec(self, specs, is_input=False):
+        out_specs = []
+
+        if specs is None:
+            # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function
+            # to generate `Input`. But how can we know the actual shape of each input tensor?
+            if is_input:
+                out_specs = [
+                    Input(
+                        name=n, shape=[None])
+                    for n in extract_args(self.network.forward) if n != 'self'
+                ]
+            else:
+                out_specs = to_list(specs)
+        elif isinstance(specs, dict):
+            assert is_input == False
+            out_specs = [specs[n] \
+                for n in extract_args(self.network.forward) if n != 'self']
+        else:
+            out_specs = to_list(specs)
+        # Note: checks each element has specificed `name`.
+        if out_specs is not None:
+            for i, spec in enumerate(out_specs):
+                assert isinstance(spec, Input)
+                if spec.name is None:
+                    raise ValueError(
+                        "Requires Input[{}].name != None, but receive `None` with {}.".
+                        format(i, spec))
+
+        return out_specs
+
     def _reset_metrics(self):
         for metric in self._metrics:
             metric.reset()
 
     def _metrics_name(self):
-        metrics_name = ['loss'] if self._loss_function else []
+        metrics_name = ['loss'] if self._loss else []
         for m in self._metrics:
             metrics_name.extend(to_list(m.name()))
         return metrics_name
diff --git a/python/paddle/incubate/hapi/tests/CMakeLists.txt b/python/paddle/incubate/hapi/tests/CMakeLists.txt
index 5cad495de7c88781de50de9b2bbe1a765a45582f..8ffcd67443f1c8722560da20d9cfb76b18a67351 100644
--- a/python/paddle/incubate/hapi/tests/CMakeLists.txt
+++ b/python/paddle/incubate/hapi/tests/CMakeLists.txt
@@ -12,6 +12,7 @@ endforeach()
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150) 
 
 
 function(py_dist_test TARGET_NAME)
diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
index b338f3310b4c796e66d88b21f1bb8353dbf5b572..ede99a50c2fa72da3bd1999204a5fe1e5a656be2 100644
--- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
+++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
@@ -25,7 +25,7 @@ from paddle import fluid
 from paddle.incubate.hapi import Model, Input, set_device
 from paddle.nn.layer.loss import CrossEntropyLoss
 from paddle.incubate.hapi.vision.models import LeNet
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.callbacks import ProgBarLogger
 from paddle.incubate.hapi.datasets import MNIST
 
@@ -64,8 +64,8 @@ class TestDistTraning(unittest.TestCase):
         im_shape = (-1, 1, 28, 28)
         batch_size = 128
 
-        inputs = [Input('image', im_shape, 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input(im_shape, 'float32', 'image')]
+        labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.Momentum(
diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
index 1484620a4efdfff0c084153e9edb001833d744ef..28305fc6a6fd08c160f946920e85391cd444caef 100644
--- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
+++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
@@ -25,7 +25,7 @@ from paddle import fluid
 from paddle.incubate.hapi import Model, Input, set_device
 from paddle.nn.layer.loss import CrossEntropyLoss
 from paddle.incubate.hapi.vision.models import LeNet
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.callbacks import ProgBarLogger
 from paddle.incubate.hapi.datasets import MNIST
 
@@ -63,8 +63,8 @@ class TestDistTraning(unittest.TestCase):
         im_shape = (-1, 1, 28, 28)
         batch_size = 128
 
-        inputs = [Input('image', im_shape, 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input(im_shape, 'float32', 'image')]
+        labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.Momentum(
diff --git a/python/paddle/incubate/hapi/tests/test_callbacks.py b/python/paddle/incubate/hapi/tests/test_callbacks.py
index 2a8a470736d921628edadb55b7e0cc956e2f37f1..e49bf215c276c8b495b0f991a5821d4c674f48d2 100644
--- a/python/paddle/incubate/hapi/tests/test_callbacks.py
+++ b/python/paddle/incubate/hapi/tests/test_callbacks.py
@@ -36,7 +36,7 @@ class TestCallbacks(unittest.TestCase):
         freq = 2
         eval_steps = 20
 
-        inputs = [Input('image', [None, 1, 28, 28], 'float32')]
+        inputs = [Input([None, 1, 28, 28], 'float32', 'image')]
         lenet = Model(LeNet(), inputs)
         lenet.prepare()
 
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_cifar.py b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d9f4353c0ed639f5ad907c921bf7b2c88271f5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestCifar10Train(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar10(mode='train')
+        self.assertTrue(len(cifar) == 50000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 50000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestCifar10Test(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar10(mode='test')
+        self.assertTrue(len(cifar) == 10000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 10000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestCifar100Train(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar100(mode='train')
+        self.assertTrue(len(cifar) == 50000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 50000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 99)
+
+
+class TestCifar100Test(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar100(mode='test')
+        self.assertTrue(len(cifar) == 10000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 10000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_conll05.py b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ed2a4180d0cb341f5d57bdf1cb9d8ef145a44fb
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestConll05st(unittest.TestCase):
+    def test_main(self):
+        conll05st = Conll05st()
+        self.assertTrue(len(conll05st) == 5267)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5267)
+        sample = conll05st[idx]
+        self.assertTrue(len(sample) == 9)
+        for s in sample:
+            self.assertTrue(len(s.shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imdb.py b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef73634b6b5fb114fa88b785bb77a87fe129bd5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestImdbTrain(unittest.TestCase):
+    def test_main(self):
+        imdb = Imdb(mode='train')
+        self.assertTrue(len(imdb) == 25000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 25000)
+        data, label = imdb[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(label.shape[0] == 1)
+        self.assertTrue(int(label) in [0, 1])
+
+
+class TestImdbTest(unittest.TestCase):
+    def test_main(self):
+        imdb = Imdb(mode='test')
+        self.assertTrue(len(imdb) == 25000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 25000)
+        data, label = imdb[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(label.shape[0] == 1)
+        self.assertTrue(int(label) in [0, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3d97d314acbf7f55a8482fd386581fef7f16e03
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestImikolovTrain(unittest.TestCase):
+    def test_main(self):
+        imikolov = Imikolov(mode='train', data_type='NGRAM', window_size=2)
+        self.assertTrue(len(imikolov) == 929589)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 929589)
+        data = imikolov[idx]
+        self.assertTrue(len(data) == 2)
+
+
+class TestImikolovTest(unittest.TestCase):
+    def test_main(self):
+        imikolov = Imikolov(mode='test', data_type='NGRAM', window_size=2)
+        self.assertTrue(len(imikolov) == 82430)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 82430)
+        data = imikolov[idx]
+        self.assertTrue(len(data) == 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a7a3035ee0e86f8ee2fa9e8a23f6036758d2d
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestMovieReviewsTrain(unittest.TestCase):
+    def test_main(self):
+        movie_reviews = MovieReviews(mode='train')
+        self.assertTrue(len(movie_reviews) == 1600)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1600)
+        data = movie_reviews[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(int(data[1]) in [0, 1])
+
+
+class TestMovieReviewsTest(unittest.TestCase):
+    def test_main(self):
+        movie_reviews = MovieReviews(mode='test')
+        self.assertTrue(len(movie_reviews) == 400)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 400)
+        data = movie_reviews[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(int(data[1]) in [0, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movielens.py b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..f94269f930e05e04b3bdfc4324e5ae1ea15b1fb9
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestMovielensTrain(unittest.TestCase):
+    def test_main(self):
+        movielens = Movielens(mode='train')
+        # movielens dataset random split train/test
+        # not check dataset length here
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 900000)
+        data = movielens[idx]
+        self.assertTrue(len(data) == 8)
+        for i, d in enumerate(data):
+            self.assertTrue(len(d.shape) == 1)
+            if i not in [5, 6]:
+                self.assertTrue(d.shape[0] == 1)
+
+
+class TestMovielensTest(unittest.TestCase):
+    def test_main(self):
+        movielens = Movielens(mode='test')
+        # movielens dataset random split train/test
+        # not check dataset length here
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 100000)
+        data = movielens[idx]
+        self.assertTrue(len(data) == 8)
+        for i, d in enumerate(data):
+            self.assertTrue(len(d.shape) == 1)
+            if i not in [5, 6]:
+                self.assertTrue(d.shape[0] == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..768367bff9911a352ea6b13f279d5b71938bc85b
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestUCIHousingTrain(unittest.TestCase):
+    def test_main(self):
+        uci_housing = UCIHousing(mode='train')
+        self.assertTrue(len(uci_housing) == 404)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 404)
+        data = uci_housing[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(data[0].shape[0] == 13)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(data[1].shape[0] == 1)
+
+
+class TestUCIHousingTest(unittest.TestCase):
+    def test_main(self):
+        uci_housing = UCIHousing(mode='test')
+        self.assertTrue(len(uci_housing) == 102)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 102)
+        data = uci_housing[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(data[0].shape[0] == 13)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(data[1].shape[0] == 1)
+
+
+class TestWMT14Train(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='train', dict_size=50)
+        self.assertTrue(len(wmt14) == 191155)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 191155)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Test(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='test', dict_size=50)
+        self.assertTrue(len(wmt14) == 5957)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5957)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Gen(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='gen', dict_size=50)
+        self.assertTrue(len(wmt14) == 3001)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3001)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_voc.py b/python/paddle/incubate/hapi/tests/test_dataset_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..85766ab8e30a3a7abd5e2966e6353b116c03e926
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_voc.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import voc2012, VOC2012
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+# VOC2012 is too large for unittest to download, stub a small dataset here
+voc2012.VOC_URL = 'https://paddlemodels.bj.bcebos.com/voc2012_stub/VOCtrainval_11-May-2012.tar'
+voc2012.VOC_MD5 = '34cb1fe5bdc139a5454b25b16118fff8'
+
+
+class TestVOC2012Train(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='train')
+        self.assertTrue(len(voc2012) == 3)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+class TestVOC2012Valid(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='valid')
+        self.assertTrue(len(voc2012) == 1)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+class TestVOC2012Test(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='test')
+        self.assertTrue(len(voc2012) == 2)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_wmt.py b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..987e55676aadb77582c58b13e626d7258f3c75b5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestWMT14Train(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='train', dict_size=50)
+        self.assertTrue(len(wmt14) == 191155)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 191155)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Test(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='test', dict_size=50)
+        self.assertTrue(len(wmt14) == 5957)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5957)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Gen(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='gen', dict_size=50)
+        self.assertTrue(len(wmt14) == 3001)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3001)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Train(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(
+            mode='train', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 29000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 29000)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Test(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(
+            mode='test', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 1000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1000)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Val(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(mode='val', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 1014)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1014)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_metrics.py b/python/paddle/incubate/hapi/tests/test_metrics.py
deleted file mode 100644
index 3d25a275d5f1c539ce959c5231a7af771b229836..0000000000000000000000000000000000000000
--- a/python/paddle/incubate/hapi/tests/test_metrics.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid.dygraph.base import to_variable
-
-from paddle.incubate.hapi.metrics import *
-from paddle.incubate.hapi.utils import to_list
-
-
-def accuracy(pred, label, topk=(1, )):
-    maxk = max(topk)
-    pred = np.argsort(pred)[:, ::-1][:, :maxk]
-    correct = (pred == np.repeat(label, maxk, 1))
-
-    batch_size = label.shape[0]
-    res = []
-    for k in topk:
-        correct_k = correct[:, :k].sum()
-        res.append(correct_k / batch_size)
-    return res
-
-
-def convert_to_one_hot(y, C):
-    oh = np.random.random((y.shape[0], C)).astype('float32') * .5
-    for i in range(y.shape[0]):
-        oh[i, int(y[i])] = 1.
-    return oh
-
-
-class TestAccuracyDynamic(unittest.TestCase):
-    def setUp(self):
-        self.topk = (1, )
-        self.class_num = 5
-        self.sample_num = 1000
-        self.name = None
-
-    def random_pred_label(self):
-        label = np.random.randint(0, self.class_num,
-                                  (self.sample_num, 1)).astype('int64')
-        pred = np.random.randint(0, self.class_num,
-                                 (self.sample_num, 1)).astype('int32')
-        pred_one_hot = convert_to_one_hot(pred, self.class_num)
-        pred_one_hot = pred_one_hot.astype('float32')
-
-        return label, pred_one_hot
-
-    def test_main(self):
-        with fluid.dygraph.guard(fluid.CPUPlace()):
-            acc = Accuracy(topk=self.topk, name=self.name)
-            for _ in range(10):
-                label, pred = self.random_pred_label()
-                label_var = to_variable(label)
-                pred_var = to_variable(pred)
-                state = to_list(acc.add_metric_op(pred_var, label_var))
-                acc.update(* [s.numpy() for s in state])
-                res_m = acc.accumulate()
-                res_f = accuracy(pred, label, self.topk)
-                assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \
-                        "Accuracy precision error: {} != {}".format(res_m, res_f)
-                acc.reset()
-                assert np.sum(acc.total) == 0
-                assert np.sum(acc.count) == 0
-
-
-class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
-    def setUp(self):
-        self.topk = (1, 5)
-        self.class_num = 10
-        self.sample_num = 1000
-        self.name = "accuracy"
-
-
-class TestAccuracyStatic(TestAccuracyDynamic):
-    def test_main(self):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            pred = fluid.data(
-                name='pred', shape=[None, self.class_num], dtype='float32')
-            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            acc = Accuracy(topk=self.topk, name=self.name)
-            state = acc.add_metric_op(pred, label)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        compiled_main_prog = fluid.CompiledProgram(main_prog)
-
-        for _ in range(10):
-            label, pred = self.random_pred_label()
-            state_ret = exe.run(compiled_main_prog,
-                                feed={'pred': pred,
-                                      'label': label},
-                                fetch_list=[s.name for s in to_list(state)],
-                                return_numpy=True)
-            acc.update(*state_ret)
-            res_m = acc.accumulate()
-            res_f = accuracy(pred, label, self.topk)
-            assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \
-                    "Accuracy precision error: {} != {}".format(res_m, res_f)
-            acc.reset()
-            assert np.sum(acc.total) == 0
-            assert np.sum(acc.count) == 0
-
-
-class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
-    def setUp(self):
-        self.topk = (1, 5)
-        self.class_num = 10
-        self.sample_num = 1000
-        self.name = "accuracy"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py
index f8be2e242568de10bfbf14fb3b88ef88fb0094da..8e0c051ee8c39c032dcc05afa466b493e1498a86 100644
--- a/python/paddle/incubate/hapi/tests/test_model.py
+++ b/python/paddle/incubate/hapi/tests/test_model.py
@@ -23,16 +23,18 @@ import shutil
 import tempfile
 
 from paddle import fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 from paddle.fluid.dygraph.base import to_variable
 
 import paddle.incubate.hapi as hapi
 from paddle.incubate.hapi import Model, Input
 from paddle.nn.layer.loss import CrossEntropyLoss
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.datasets import MNIST
 from paddle.incubate.hapi.vision.models import LeNet
 from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context
+from paddle.fluid.dygraph.jit import declarative
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
 
 
 class LeNetDygraph(fluid.dygraph.Layer):
@@ -40,21 +42,19 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -65,6 +65,35 @@ class LeNetDygraph(fluid.dygraph.Layer):
         return x
 
 
+class LeNetDeclarative(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10, classifier_activation=None):
+        super(LeNetDeclarative, self).__init__()
+        self.num_classes = num_classes
+        self.features = Sequential(
+            Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            ReLU(),
+            Pool2D(2, 'max', 2),
+            Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            ReLU(),
+            Pool2D(2, 'max', 2))
+
+        if num_classes > 0:
+            self.fc = Sequential(
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
+
+    @declarative
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.fc(x)
+        return x
+
+
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
@@ -150,8 +179,8 @@ class TestModel(unittest.TestCase):
 
         cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader)
 
-        cls.inputs = [Input('image', [-1, 1, 28, 28], 'float32')]
-        cls.labels = [Input('label', [None, 1], 'int64')]
+        cls.inputs = [Input([-1, 1, 28, 28], 'float32', 'image')]
+        cls.labels = [Input([None, 1], 'int64', 'label')]
 
         cls.save_dir = tempfile.mkdtemp()
         cls.weight_path = os.path.join(cls.save_dir, 'lenet')
@@ -169,6 +198,12 @@ class TestModel(unittest.TestCase):
     def test_fit_static(self):
         self.fit(False)
 
+    def test_fit_dynamic_with_rank(self):
+        self.fit(True, 2, 0)
+
+    def test_fit_static_with_rank(self):
+        self.fit(False, 2, 0)
+
     def test_evaluate_dygraph(self):
         self.evaluate(True)
 
@@ -184,7 +219,7 @@ class TestModel(unittest.TestCase):
     def test_prepare_context(self):
         prepare_distributed_context()
 
-    def fit(self, dynamic):
+    def fit(self, dynamic, num_replicas=None, rank=None):
         fluid.enable_dygraph(self.device) if dynamic else None
         seed = 333
         fluid.default_startup_program().random_seed = seed
@@ -196,7 +231,7 @@ class TestModel(unittest.TestCase):
         model = Model(net, inputs=self.inputs, labels=self.labels)
         model.prepare(
             optim_new,
-            loss_function=CrossEntropyLoss(reduction="sum"),
+            loss=CrossEntropyLoss(reduction="sum"),
             metrics=Accuracy())
         model.fit(self.train_dataset, batch_size=64, shuffle=False)
 
@@ -204,9 +239,17 @@ class TestModel(unittest.TestCase):
         np.testing.assert_allclose(result['acc'], self.acc1)
 
         train_sampler = DistributedBatchSampler(
-            self.train_dataset, batch_size=64, shuffle=False)
+            self.train_dataset,
+            batch_size=64,
+            shuffle=False,
+            num_replicas=num_replicas,
+            rank=rank)
         val_sampler = DistributedBatchSampler(
-            self.val_dataset, batch_size=64, shuffle=False)
+            self.val_dataset,
+            batch_size=64,
+            shuffle=False,
+            num_replicas=num_replicas,
+            rank=rank)
 
         train_loader = fluid.io.DataLoader(
             self.train_dataset,
@@ -273,10 +316,12 @@ class TestModel(unittest.TestCase):
 class MyModel(fluid.dygraph.Layer):
     def __init__(self, classifier_activation='softmax'):
         super(MyModel, self).__init__()
-        self._fc = Linear(20, 10, act=classifier_activation)
+        self._fc = Linear(20, 10)
+        self._act = Softmax()  #Todo: accept any activation
 
     def forward(self, x):
         y = self._fc(x)
+        y = self._act(y)
         return y
 
 
@@ -316,13 +361,11 @@ class TestModelFunction(unittest.TestCase):
             optim2 = fluid.optimizer.SGD(learning_rate=0.001,
                                          parameter_list=net.parameters())
 
-            inputs = [Input('x', [None, dim], 'float32')]
-            labels = [Input('label', [None, 1], 'int64')]
+            inputs = [Input([None, dim], 'float32', 'x')]
+            labels = [Input([None, 1], 'int64', 'label')]
             model = Model(net, inputs, labels)
-            model.prepare(
-                optim2, loss_function=CrossEntropyLoss(reduction="sum"))
+            model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum"))
             loss, = model.train_batch([data], [label])
-
             np.testing.assert_allclose(loss.flatten(), ref.flatten())
             fluid.disable_dygraph() if dynamic else None
 
@@ -345,7 +388,7 @@ class TestModelFunction(unittest.TestCase):
             fluid.enable_dygraph(device) if dynamic else None
             self.set_seed()
             net = MyModel()
-            inputs = [Input('x', [None, dim], 'float32')]
+            inputs = [Input([None, dim], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
             out, = model.test_batch([data])
@@ -359,14 +402,13 @@ class TestModelFunction(unittest.TestCase):
             device = hapi.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             net = MyModel(classifier_activation=None)
-            inputs = [Input('x', [None, 20], 'float32')]
-            labels = [Input('label', [None, 1], 'int64')]
+            inputs = [Input([None, 20], 'float32', 'x')]
+            labels = [Input([None, 1], 'int64', 'label')]
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=net.parameters())
             model = Model(net, inputs, labels)
             model.prepare(
-                optimizer=optim,
-                loss_function=CrossEntropyLoss(reduction="sum"))
+                optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
             model.save(path + '/test')
             model.load(path + '/test')
             shutil.rmtree(path)
@@ -380,18 +422,16 @@ class TestModelFunction(unittest.TestCase):
         model = Model(MyModel(classifier_activation=None))
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.save(path + '/test')
         fluid.disable_dygraph()
 
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         model = Model(MyModel(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.load(path + '/test')
         shutil.rmtree(path)
 
@@ -399,26 +439,24 @@ class TestModelFunction(unittest.TestCase):
         path = tempfile.mkdtemp()
 
         net = MyModel(classifier_activation=None)
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=net.parameters())
         model = Model(net, inputs, labels)
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.save(path + '/test')
 
         device = hapi.set_device('cpu')
         fluid.enable_dygraph(device)  #if dynamic else None
 
         net = MyModel(classifier_activation=None)
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=net.parameters())
         model = Model(net, inputs, labels)
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.load(path + '/test')
         shutil.rmtree(path)
         fluid.disable_dygraph()
@@ -428,7 +466,7 @@ class TestModelFunction(unittest.TestCase):
             device = hapi.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             net = MyModel()
-            inputs = [Input('x', [None, 20], 'float32')]
+            inputs = [Input([None, 20], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
             params = model.parameters()
@@ -437,33 +475,48 @@ class TestModelFunction(unittest.TestCase):
             fluid.disable_dygraph() if dynamic else None
 
     def test_export_deploy_model(self):
-        net = LeNet()
-        inputs = [Input('image', [-1, 1, 28, 28], 'float32')]
-        model = Model(net, inputs)
-        model.prepare()
-        save_dir = tempfile.mkdtemp()
-        if not os.path.exists(save_dir):
-            os.makedirs(save_dir)
-
-        tensor_img = np.array(
-            np.random.random((1, 1, 28, 28)), dtype=np.float32)
-        ori_results = model.test_batch(tensor_img)
-
-        model.save_inference_model(save_dir)
-
-        place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        [inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=save_dir, executor=exe))
+        for dynamic in [True, False]:
+            fluid.enable_dygraph() if dynamic else None
+            # paddle.disable_static() if dynamic else None
+            prog_translator = ProgramTranslator()
+            prog_translator.enable(False) if not dynamic else None
+            net = LeNetDeclarative()
+            inputs = [Input([None, 1, 28, 28], 'float32', 'x')]
+            model = Model(net, inputs)
+            model.prepare()
+            save_dir = tempfile.mkdtemp()
+            if not os.path.exists(save_dir):
+                os.makedirs(save_dir)
+            tensor_img = np.array(
+                np.random.random((1, 1, 28, 28)), dtype=np.float32)
+            ori_results = model.test_batch(tensor_img)
+            model.save(save_dir, training=False)
+            fluid.disable_dygraph() if dynamic else None
 
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
+            place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            new_scope = fluid.Scope()
+            with fluid.scope_guard(new_scope):
+                exe = fluid.Executor(place)
+                [inference_program, feed_target_names, fetch_targets] = (
+                    fluid.io.load_inference_model(
+                        dirname=save_dir, executor=exe))
+                results = exe.run(inference_program,
+                                  feed={feed_target_names[0]: tensor_img},
+                                  fetch_list=fetch_targets)
+                np.testing.assert_allclose(
+                    results, ori_results, rtol=1e-5, atol=1e-7)
+                shutil.rmtree(save_dir)
+
+
+class TestRaiseError(unittest.TestCase):
+    def test_input_without_name(self):
+        net = MyModel(classifier_activation=None)
 
-        np.testing.assert_allclose(results, ori_results, rtol=1e-6)
-        shutil.rmtree(save_dir)
+        inputs = [Input([None, 10], 'float32')]
+        labels = [Input([None, 1], 'int64', 'label')]
+        with self.assertRaises(ValueError):
+            model = Model(net, inputs, labels)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/incubate/hapi/tests/test_pretrained_model.py b/python/paddle/incubate/hapi/tests/test_pretrained_model.py
index 588797322f4ab8e9eef9cc184cc6d82635de7d01..334ebff449d4f34c9a5a9b56ee7998b4dbc5abf0 100644
--- a/python/paddle/incubate/hapi/tests/test_pretrained_model.py
+++ b/python/paddle/incubate/hapi/tests/test_pretrained_model.py
@@ -28,7 +28,7 @@ class TestPretrainedModel(unittest.TestCase):
             fluid.enable_dygraph()
 
         net = models.__dict__[arch](pretrained=True, classifier_activation=None)
-        inputs = [Input('image', [None, 3, 224, 224], 'float32')]
+        inputs = [Input([None, 3, 224, 224], 'float32', 'image')]
         model = Model(network=net, inputs=inputs)
         model.prepare()
         res = model.test_batch(x)
diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py
index 78f089b06a38dec4eb189a9744e503f517f220db..bdc637997b0cbd8389fdfab9f71597c62b0e21a3 100644
--- a/python/paddle/incubate/hapi/tests/test_text.py
+++ b/python/paddle/incubate/hapi/tests/test_text.py
@@ -142,7 +142,7 @@ class TestBasicLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -168,7 +168,7 @@ class TestBasicGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -219,8 +219,8 @@ class TestBeamSearch(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("init_hidden", [None, self.inputs[0].shape[-1]], "float32"),
-            Input("init_cell", [None, self.inputs[1].shape[-1]], "float32"),
+            Input([None, self.inputs[0].shape[-1]], "float32", "init_hidden"),
+            Input([None, self.inputs[1].shape[-1]], "float32", "init_cell"),
         ]
         return inputs
 
@@ -272,10 +272,10 @@ class TestTransformerEncoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("enc_input", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("attn_bias", [None, self.inputs[1].shape[1], None, None],
-                  "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_input"),
+            Input([None, self.inputs[1].shape[1], None, None], "float32",
+                  "attn_bias"),
         ]
         return inputs
 
@@ -336,14 +336,14 @@ class TestTransformerDecoder(TestTransformerEncoder):
 
     def make_inputs(self):
         inputs = [
-            Input("dec_input", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("enc_output", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("self_attn_bias",
-                  [None, self.inputs[-1].shape[1], None, None], "float32"),
-            Input("cross_attn_bias",
-                  [None, self.inputs[-1].shape[1], None, None], "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "dec_input"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_output"),
+            Input([None, self.inputs[-1].shape[1], None, None], "float32",
+                  "self_attn_bias"),
+            Input([None, self.inputs[-1].shape[1], None, None], "float32",
+                  "cross_attn_bias"),
         ]
         return inputs
 
@@ -431,10 +431,10 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("enc_output", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("trg_src_attn_bias",
-                  [None, self.inputs[1].shape[1], None, None], "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_output"),
+            Input([None, self.inputs[1].shape[1], None, None], "float32",
+                  "trg_src_attn_bias"),
         ]
         return inputs
 
@@ -473,9 +473,9 @@ class TestSequenceTagging(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("word", [None, None], "int64"),
-            Input("lengths", [None], "int64"),
-            Input("target", [None, None], "int64"),
+            Input([None, None], "int64", "word"),
+            Input([None], "int64", "lengths"),
+            Input([None, None], "int64", "target"),
         ]
         return inputs
 
@@ -517,7 +517,7 @@ class TestStackedRNN(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -543,7 +543,7 @@ class TestLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -579,7 +579,7 @@ class TestBiLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -609,7 +609,7 @@ class TestGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -645,7 +645,7 @@ class TestBiGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -680,7 +680,7 @@ class TestCNNEncoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, self.inputs[-1].shape[1], None], "float32"),
+            Input([None, self.inputs[-1].shape[1], None], "float32", "input"),
         ]
         return inputs
 
diff --git a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
index c2035a8b5c5958d54c79d6ee0ff6df654bb35d51..6df9b31217aae78c43de8d29956a8b2def99055b 100644
--- a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
+++ b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
@@ -22,7 +22,7 @@ import shutil
 import tempfile
 
 from paddle import fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 
 from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict
 
@@ -32,21 +32,19 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/incubate/hapi/tests/test_vision_models.py b/python/paddle/incubate/hapi/tests/test_vision_models.py
index 16dbe431be801c9cd7ce48c4cd1444b7e0e558a4..2dc9355bcc3005d48b7046123b024fa2a91594c3 100644
--- a/python/paddle/incubate/hapi/tests/test_vision_models.py
+++ b/python/paddle/incubate/hapi/tests/test_vision_models.py
@@ -28,7 +28,7 @@ class TestVisonModels(unittest.TestCase):
         else:
             net = models.__dict__[arch](pretrained=pretrained)
 
-        input = hapi.Input('image', [None, 3, 224, 224], 'float32')
+        input = hapi.Input([None, 3, 224, 224], 'float32', 'image')
         model = hapi.Model(net, input)
         model.prepare()
 
@@ -71,7 +71,7 @@ class TestVisonModels(unittest.TestCase):
         self.models_infer('resnet152')
 
     def test_lenet(self):
-        input = hapi.Input('x', [None, 1, 28, 28], 'float32')
+        input = hapi.Input([None, 1, 28, 28], 'float32', 'x')
         lenet = hapi.Model(models.__dict__['LeNet'](), input)
         lenet.prepare()
 
diff --git a/python/paddle/incubate/hapi/text/text.py b/python/paddle/incubate/hapi/text/text.py
index 1424ce0381ac22e3fc15db854e653e0c2632cf22..a2940fbe6cf483bce905c596a4b50294129fab54 100644
--- a/python/paddle/incubate/hapi/text/text.py
+++ b/python/paddle/incubate/hapi/text/text.py
@@ -1804,7 +1804,7 @@ class DynamicDecode(Layer):
             from paddle.fluid.layers import BeamSearchDecoder
             from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             vocab_size, d_model, = 100, 32
             encoder_output = paddle.rand((2, 4, d_model))
@@ -2278,7 +2278,7 @@ class TransformerCell(RNNCell):
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
             from paddle.incubate.hapi.text import DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             class Embedder(fluid.dygraph.Layer):
                 def __init__(self):
@@ -2445,7 +2445,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
             from paddle.incubate.hapi.text import DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             class Embedder(fluid.dygraph.Layer):
                 def __init__(self):
diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/incubate/hapi/vision/models/lenet.py
index db1d894b4aa5f2535795c6350faad6ee3aee1164..169f70562f6edfe1773a1c8d75004c25831cedcb 100644
--- a/python/paddle/incubate/hapi/vision/models/lenet.py
+++ b/python/paddle/incubate/hapi/vision/models/lenet.py
@@ -13,7 +13,7 @@
 #limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 
 __all__ = ['LeNet']
 
@@ -39,21 +39,19 @@ class LeNet(fluid.dygraph.Layer):
         super(LeNet, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/incubate/hapi/vision/models/vgg.py
index 74e7228e5249fe990d037c9f12e75b6d4839c591..4352a768eb7206ca30acead580a64a7d04b7701b 100644
--- a/python/paddle/incubate/hapi/vision/models/vgg.py
+++ b/python/paddle/incubate/hapi/vision/models/vgg.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax
 from paddle.fluid.dygraph.container import Sequential
 
 from ...download import get_weights_path_from_url
@@ -37,7 +37,8 @@ class Classifier(fluid.dygraph.Layer):
         super(Classifier, self).__init__()
         self.linear1 = Linear(512 * 7 * 7, 4096)
         self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes, act=classifier_activation)
+        self.linear3 = Linear(4096, num_classes)
+        self.act = Softmax()  #Todo: accept any activation
 
     def forward(self, x):
         x = self.linear1(x)
@@ -46,7 +47,8 @@ class Classifier(fluid.dygraph.Layer):
         x = self.linear2(x)
         x = fluid.layers.relu(x)
         x = fluid.layers.dropout(x, 0.5)
-        out = self.linear3(x)
+        x = self.linear3(x)
+        out = self.act(x)
         return out
 
 
@@ -105,12 +107,11 @@ def make_layers(cfg, batch_norm=False):
             layers += [Pool2D(pool_size=2, pool_stride=2)]
         else:
             if batch_norm:
-                conv2d = Conv2D(in_channels, v, filter_size=3, padding=1)
-                layers += [conv2d, BatchNorm(v, act='relu')]
+                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
+                layers += [conv2d, BatchNorm(v), ReLU()]
             else:
-                conv2d = Conv2D(
-                    in_channels, v, filter_size=3, padding=1, act='relu')
-                layers += [conv2d]
+                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
+                layers += [conv2d, ReLU()]
             in_channels = v
     return Sequential(*layers)
 
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index f9c083d2aeeeedb98b7d0ea2364dbb278ed26282..78f792d6a5a6698034912297f5d5a23db0b35201 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -15,9 +15,15 @@
 # TODO: define all functions about input & output in this directory 
 __all__ = [
     'Dataset',
+    'IterableDataset',
+    'TensorDataset',
     'BatchSampler',
     #            'Transform',
     'DataLoader',
+    'get_worker_info',
+    'Sampler',
+    'SequenceSampler',
+    'RandomSampler',
     'load',
     'save',
     'load_program_state',
@@ -36,7 +42,8 @@ __all__ = [
 ]
 
 from ..fluid.io import DataLoader
-from ..fluid.dataloader import Dataset, BatchSampler
+from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
+        TensorDataset, Sampler, SequenceSampler, RandomSampler
 from ..fluid.io import load, save, load_program_state, set_program_state, \
         load_inference_model, save_inference_model, batch
 from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers
diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03299a3bb9823d31c40ae4faab601ed89570c71e
--- /dev/null
+++ b/python/paddle/jit/__init__.py
@@ -0,0 +1,28 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fluid.dygraph.jit import save  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import load  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import SaveLoadConfig  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import TracedLayer  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import set_code_level  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import set_verbosity  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import declarative as to_static  #DEFINE_ALIAS
+from ..fluid.dygraph import ProgramTranslator  #DEFINE_ALIAS
+from ..fluid.dygraph.io import TranslatedLayer  #DEFINE_ALIAS
+
+__all__ = [
+    'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static',
+    'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity'
+]
diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py
index e03336f6dbab7b375701e1e694aee0bbbfa4b1cd..6e197881fc0bcbc32f9d9d738237082138f9410b 100644
--- a/python/paddle/metric/__init__.py
+++ b/python/paddle/metric/__init__.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the functions to calculate metric in this directory 
-__all__ = [
-    'Accuracy', 'Auc', 'ChunkEvaluator', 'CompositeMetric', 'DetectionMAP',
-    'EditDistance', 'Precision', 'Recall', 'accuracy', 'auc', 'chunk_eval',
-    'cos_sim', 'mean_iou'
-]
-
-
-
-from ..fluid.metrics import Accuracy, Auc, ChunkEvaluator, CompositeMetric, DetectionMAP, EditDistance, \
-        Precision, Recall
+from .metrics import *
+from . import metrics
 
 from ..fluid.layers.metric_op import accuracy, auc
 from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou
+
+__all__ = metrics.__all__ + [
+    'accuracy',
+    'auc',
+    'chunk_eval',
+    'cos_sim',
+    'mean_iou',
+]
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..110a62c300559b9037cd2ca735aebd1946ba0ce9
--- /dev/null
+++ b/python/paddle/metric/metrics.py
@@ -0,0 +1,738 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import abc
+import numpy as np
+
+import paddle
+
+__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc']
+
+
+def _is_numpy_(var):
+    return isinstance(var, (np.ndarray, np.generic))
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Metric(object):
+    """
+    Base class for metric, encapsulates metric logic and APIs
+    Usage:
+        
+        m = SomeMetric()
+        for prediction, label in ...:
+            m.update(prediction, label)
+        m.accumulate()
+        
+    Advanced usage for :code:`compute`:
+
+    Metric calculation can be accelerated by calculating metric states
+    from model outputs and labels by build-in operators not by Python/NumPy
+    in :code:`compute`, metric states will be fetched as NumPy array and
+    call :code:`update` with states in NumPy format.
+    Metric calculated as follows (operations in Model and Metric are
+    indicated with curly brackets, while data nodes not):
+                 inputs & labels              || ------------------
+                       |                      ||
+                    {model}                   ||
+                       |                      ||
+                outputs & labels              ||
+                       |                      ||    tensor data
+                {Metric.compute}              ||
+                       |                      ||
+              metric states(tensor)           ||
+                       |                      ||
+                {fetch as numpy}              || ------------------
+                       |                      ||
+              metric states(numpy)            ||    numpy data
+                       |                      ||
+                {Metric.update}               \/ ------------------
+    Examples:
+        
+        For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label`
+        as inputs, we can calculate the correct prediction matrix between
+        :code:`pred` and :code:`label` in :code:`compute`.
+        For examples, prediction results contains 10 classes, while :code:`pred`
+        shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size,
+        and we only need to calculate accurary of top-1 and top-5, we could
+        calculate the correct prediction matrix of the top-5 scores of the
+        prediction of each sample like follows, while the correct prediction
+        matrix shape is [N, 5].
+
+        .. code-block:: python
+            def compute(pred, label):
+                # sort prediction and slice the top-5 scores
+                pred = paddle.argsort(pred, descending=True)[:, :5]
+                # calculate whether the predictions are correct
+                correct = pred == label
+                return paddle.cast(correct, dtype='float32')
+
+        With the :code:`compute`, we split some calculations to OPs (which
+        may run on GPU devices, will be faster), and only fetch 1 tensor with
+        shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1].
+        :code:`update` can be define as follows:
+
+        .. code-block:: python
+            def update(self, correct):
+                accs = []
+                for i, k in enumerate(self.topk):
+                    num_corrects = correct[:, :k].sum()
+                    num_samples = len(correct)
+                    accs.append(float(num_corrects) / num_samples)
+                    self.total[i] += num_corrects
+                    self.count[i] += num_samples
+                return accs
+    """
+
+    def __init__(self):
+        pass
+
+    @abc.abstractmethod
+    def reset(self):
+        """
+        Reset states and result
+        """
+        raise NotImplementedError("function 'reset' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def update(self, *args):
+        """
+        Update states for metric
+
+        Inputs of :code:`update` is the outputs of :code:`Metric.compute`,
+        if :code:`compute` is not defined, the inputs of :code:`update`
+        will be flatten arguments of **output** of mode and **label** from data:
+        :code:`update(output1, output2, ..., label1, label2,...)`
+
+        see :code:`Metric.compute`
+        """
+        raise NotImplementedError("function 'update' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def accumulate(self):
+        """
+        Accumulates statistics, computes and returns the metric value
+        """
+        raise NotImplementedError(
+            "function 'accumulate' not implemented in {}.".format(
+                self.__class__.__name__))
+
+    @abc.abstractmethod
+    def name(self):
+        """
+        Returns metric name
+        """
+        raise NotImplementedError("function 'name' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    def compute(self, *args):
+        """
+        This API is advanced usage to accelerate metric calculating, calulations
+        from outputs of model to the states which should be updated by Metric can
+        be defined here, where Paddle OPs is also supported. Outputs of this API
+        will be the inputs of "Metric.update".
+
+        If :code:`compute` is defined, it will be called with **outputs**
+        of model and **labels** from data as arguments, all outputs and labels
+        will be concatenated and flatten and each filed as a separate argument
+        as follows:
+        :code:`compute(output1, output2, ..., label1, label2,...)`
+
+        If :code:`compute` is not defined, default behaviour is to pass
+        input to output, so output format will be:
+        :code:`return output1, output2, ..., label1, label2,...`
+
+        see :code:`Metric.update`
+        """
+        return args
+
+
+class Accuracy(Metric):
+    """
+    Encapsulates accuracy metric logic.
+
+    Args:
+        topk (int|tuple(int)): Number of top elements to look at
+            for computing accuracy. Default is (1,).
+        name (str, optional): String name of the metric instance. Default
+            is `acc`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([
+            [0.1, 0.2, 0.3, 0.4],
+            [0.1, 0.4, 0.3, 0.2],
+            [0.1, 0.2, 0.4, 0.3],
+            [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+
+        m = paddle.metric.Accuracy()
+        correct = m.compute(x, y)
+        m.update(correct)
+        res = m.accumulate()
+        print(res) # 0.75
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import paddle
+        import paddle.incubate.hapi as hapi
+
+        paddle.disable_static()
+        train_dataset = hapi.datasets.MNIST(mode='train')
+
+        model = hapi.Model(hapi.vision.LeNet(classifier_activation=None))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=paddle.nn.CrossEntropyLoss(),
+            metrics=paddle.metric.Accuracy())
+
+        model.fit(train_dataset, batch_size=64)
+
+    """
+
+    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
+        super(Accuracy, self).__init__(*args, **kwargs)
+        self.topk = topk
+        self.maxk = max(topk)
+        self._init_name(name)
+        self.reset()
+
+    def compute(self, pred, label, *args):
+        """
+        Compute the top-k (maxinum value in `topk`) indices.
+
+        Args:
+            pred (Tensor): The predicted value is a Tensor wit type
+                float32 or float64.
+            label (Tensor): The ground truth value is a 2D Tensor, its
+                shape is [batch_size, 1] and type is int64.
+
+        Return:
+            Tensor: Correct mask, a tensor with shape [batch_size, topk].
+        """
+        pred = paddle.argsort(pred, descending=True)[:, :self.maxk]
+        correct = pred == label
+        return paddle.cast(correct, dtype='float32')
+
+    def update(self, correct, *args):
+        """
+        Update the metrics states (correct count and total count), in order to
+        calculate cumulative accuracy of all instances. This function also
+        returns the accuracy of current step.
+        
+        Args:
+            correct: Correct mask, a tensor with shape [batch_size, topk].
+
+        Return:
+            Tensor: the accuracy of current step.
+        """
+        if isinstance(correct, paddle.Tensor):
+            correct = correct.numpy()
+        accs = []
+        for i, k in enumerate(self.topk):
+            num_corrects = correct[:, :k].sum()
+            num_samples = len(correct)
+            accs.append(float(num_corrects) / num_samples)
+            self.total[i] += num_corrects
+            self.count[i] += num_samples
+        accs = accs[0] if len(self.topk) == 1 else accs
+        return accs
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.total = [0.] * len(self.topk)
+        self.count = [0] * len(self.topk)
+
+    def accumulate(self):
+        """
+        Computes and returns the accumulated metric.
+        """
+        res = []
+        for t, c in zip(self.total, self.count):
+            r = float(t) / c if c > 0 else 0.
+            res.append(r)
+        res = res[0] if len(self.topk) == 1 else res
+        return res
+
+    def _init_name(self, name):
+        name = name or 'acc'
+        if self.maxk != 1:
+            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
+        else:
+            self._name = [name]
+
+    def name(self):
+        """
+        Return name of metric instance.
+        """
+        return self._name
+
+
+class Precision(Metric):
+    """
+    Precision (also called positive predictive value) is the fraction of
+    relevant instances among the retrieved instances. Refer to
+    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers
+
+    Noted that this class manages the precision score only for binary
+    classification task.
+
+    Args:
+        name (str, optional): String name of the metric instance.
+            Default is `precision`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([0, 1, 1, 1])
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        res = m.accumulate()
+        print(res) # 1.0
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('float32')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+  
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 1),
+            nn.Sigmoid()
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=nn.BCELoss(),
+            metrics=paddle.metric.Precision())
+        
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self, name='precision', *args, **kwargs):
+        super(Precision, self).__init__(*args, **kwargs)
+        self.tp = 0  # true positive
+        self.fp = 0  # false positive
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the states based on the current mini-batch prediction results.
+
+        Args:
+            preds (numpy.ndarray): The prediction result, usually the output
+                of two-class sigmoid function. It should be a vector (column
+                vector or row vector) with data type: 'float64' or 'float32'.
+            labels (numpy.ndarray): The ground truth (labels),
+                the shape should keep the same as preds.
+                The data type is 'int32' or 'int64'.
+        """
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        sample_num = labels.shape[0]
+        preds = np.floor(preds + 0.5).astype("int32")
+
+        for i in range(sample_num):
+            pred = preds[i]
+            label = labels[i]
+            if pred == 1:
+                if pred == label:
+                    self.tp += 1
+                else:
+                    self.fp += 1
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.tp = 0
+        self.fp = 0
+
+    def accumulate(self):
+        """
+        Calculate the final precision.
+
+        Returns:
+            A scaler float: results of the calculated precision.
+        """
+        ap = self.tp + self.fp
+        return float(self.tp) / ap if ap != 0 else .0
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
+
+
+class Recall(Metric):
+    """
+    Recall (also known as sensitivity) is the fraction of
+    relevant instances that have been retrieved over the
+    total amount of relevant instances
+
+    Refer to:
+    https://en.wikipedia.org/wiki/Precision_and_recall
+
+    Noted that this class manages the recall score only for
+    binary classification task.
+
+    Args:
+        name (str, optional): String name of the metric instance.
+            Default is `recall`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Recall()
+        m.update(x, y)
+        res = m.accumulate()
+        print(res) # 2.0 / 3.0
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('float32')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+        
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 1),
+            nn.Sigmoid()
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=nn.BCELoss(),
+            metrics=[paddle.metric.Precision(), paddle.metric.Recall()])
+        
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self, name='recall', *args, **kwargs):
+        super(Recall, self).__init__(*args, **kwargs)
+        self.tp = 0  # true positive
+        self.fn = 0  # false negative
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the states based on the current mini-batch prediction results.
+
+        Args:
+            preds(numpy.array): prediction results of current mini-batch,
+                the output of two-class sigmoid function.
+                Shape: [batch_size, 1]. Dtype: 'float64' or 'float32'.
+            labels(numpy.array): ground truth (labels) of current mini-batch,
+                the shape should keep the same as preds.
+                Shape: [batch_size, 1], Dtype: 'int32' or 'int64'.
+        """
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        sample_num = labels.shape[0]
+        preds = np.rint(preds).astype("int32")
+
+        for i in range(sample_num):
+            pred = preds[i]
+            label = labels[i]
+            if label == 1:
+                if pred == label:
+                    self.tp += 1
+                else:
+                    self.fn += 1
+
+    def accumulate(self):
+        """
+        Calculate the final recall.
+
+        Returns:
+            A scaler float: results of the calculated Recall.
+        """
+        recall = self.tp + self.fn
+        return float(self.tp) / recall if recall != 0 else .0
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.tp = 0
+        self.fn = 0
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
+
+
+class Auc(Metric):
+    """
+    The auc metric is for binary classification.
+    Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve.
+    Please notice that the auc metric is implemented with python, which may be a little bit slow.
+
+    The `auc` function creates four local variables, `true_positives`,
+    `true_negatives`, `false_positives` and `false_negatives` that are used to
+    compute the AUC. To discretize the AUC curve, a linearly spaced set of
+    thresholds is used to compute pairs of recall and precision values. The area
+    under the ROC-curve is therefore computed using the height of the recall
+    values by the false positive rate, while the area under the PR-curve is the
+    computed using the height of the precision values by the recall.
+
+    Args:
+        curve (str): Specifies the mode of the curve to be computed,
+            'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'.
+        num_thresholds (int): The number of thresholds to use when
+            discretizing the roc curve. Default is 4095.
+            'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'.
+        name (str, optional): String name of the metric instance. Default
+            is `auc`.
+
+    "NOTE: only implement the ROC curve type via Python now."
+
+    Example by standalone:
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        m = paddle.metric.Auc()
+        
+        n = 8
+        class0_preds = np.random.random(size = (n, 1))
+        class1_preds = 1 - class0_preds
+        
+        preds = np.concatenate((class0_preds, class1_preds), axis=1)
+        labels = np.random.randint(2, size = (n, 1))
+        
+        m.update(preds=preds, labels=labels)
+        res = m.accumulate()
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('int64')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+        
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 2, act='softmax'),
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        
+        def loss(x, y):
+            return nn.functional.nll_loss(paddle.log(x), y)
+        
+        model.prepare(
+            optim,
+            loss=loss,
+            metrics=paddle.metric.Auc())
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self,
+                 curve='ROC',
+                 num_thresholds=4095,
+                 name='auc',
+                 *args,
+                 **kwargs):
+        super(Auc, self).__init__(*args, **kwargs)
+        self._curve = curve
+        self._num_thresholds = num_thresholds
+
+        _num_pred_buckets = num_thresholds + 1
+        self._stat_pos = np.zeros(_num_pred_buckets)
+        self._stat_neg = np.zeros(_num_pred_buckets)
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the auc curve with the given predictions and labels.
+
+        Args:
+            preds (numpy.array): An numpy array in the shape of
+                (batch_size, 2), preds[i][j] denotes the probability of
+                classifying the instance i into the class j.
+            labels (numpy.array): an numpy array in the shape of
+                (batch_size, 1), labels[i] is either o or 1,
+                representing the label of the instance i.
+        """
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        for i, lbl in enumerate(labels):
+            value = preds[i, 1]
+            bin_idx = int(value * self._num_thresholds)
+            assert bin_idx <= self._num_thresholds
+            if lbl:
+                self._stat_pos[bin_idx] += 1.0
+            else:
+                self._stat_neg[bin_idx] += 1.0
+
+    @staticmethod
+    def trapezoid_area(x1, x2, y1, y2):
+        return abs(x1 - x2) * (y1 + y2) / 2.0
+
+    def accumulate(self):
+        """
+        Return the area (a float score) under auc curve
+
+        Return:
+            float: the area under auc curve
+        """
+        tot_pos = 0.0
+        tot_neg = 0.0
+        auc = 0.0
+
+        idx = self._num_thresholds
+        while idx >= 0:
+            tot_pos_prev = tot_pos
+            tot_neg_prev = tot_neg
+            tot_pos += self._stat_pos[idx]
+            tot_neg += self._stat_neg[idx]
+            auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
+                                       tot_pos_prev)
+            idx -= 1
+
+        return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
+
+    def reset(self):
+        """
+        Reset states and result
+        """
+        _num_pred_buckets = self._num_thresholds + 1
+        self._stat_pos = np.zeros(_num_pred_buckets)
+        self._stat_neg = np.zeros(_num_pred_buckets)
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 98948fa91e2e82ce0c566657114abfe79a5a7dc1..76063458d44de3000ad7c1af08376c07e0209c27 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -18,6 +18,8 @@
 from .layer import norm
 from .functional import extension
 from .layer import common
+from .layer import rnn
+from .utils import weight_norm_hook
 
 from . import initializer
 
@@ -25,6 +27,8 @@ __all__ = []
 __all__ += norm.__all__
 __all__ += extension.__all__
 __all__ += common.__all__
+__all__ += rnn.__all__
+__all__ += weight_norm_hook.__all__
 
 # TODO: define alias in nn directory
 # from .clip import ErrorClipByValue        #DEFINE_ALIAS
@@ -49,26 +53,58 @@ from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import ctc_greedy_decoder        #DEFINE_ALIAS
 # from .decode import dynamic_decode        #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
-from .input import data  #DEFINE_ALIAS
 # from .input import Input        #DEFINE_ALIAS
-# from .layer.activation import PReLU        #DEFINE_ALIAS
-from .layer.activation import ReLU  #DEFINE_ALIAS
+from .layer.activation import ELU
+from .layer.activation import GELU
+from .layer.activation import Tanh
+from .layer.activation import Hardshrink
+from .layer.activation import Hardtanh
+from .layer.activation import PReLU
+from .layer.activation import ReLU
+from .layer.activation import ReLU6  #DEFINE_ALIAS
+from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
 from .layer.activation import Sigmoid  #DEFINE_ALIAS
-# from .layer.activation import Softmax        #DEFINE_ALIAS
+from .layer.activation import LogSigmoid
+from .layer.activation import Softmax  #DEFINE_ALIAS
+from .layer.activation import Softplus  #DEFINE_ALIAS
+from .layer.activation import Softshrink  #DEFINE_ALIAS
+from .layer.activation import Softsign  #DEFINE_ALIAS
+from .layer.activation import Tanhshrink  #DEFINE_ALIAS
 from .layer.activation import LogSoftmax  #DEFINE_ALIAS
 from .layer.activation import HSigmoid  #DEFINE_ALIAS
 from .layer.common import BilinearTensorProduct  #DEFINE_ALIAS
 from .layer.common import Pool2D  #DEFINE_ALIAS
 from .layer.common import Pad2D  #DEFINE_ALIAS
+from .layer.common import ReflectionPad1d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad1d  #DEFINE_ALIAS
+from .layer.common import ConstantPad1d  #DEFINE_ALIAS
+from .layer.common import ReflectionPad2d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad2d  #DEFINE_ALIAS
+from .layer.common import ConstantPad2d  #DEFINE_ALIAS
+from .layer.common import ZeroPad2d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad3d  #DEFINE_ALIAS
+from .layer.common import ConstantPad3d  #DEFINE_ALIAS
+from .layer.common import CosineSimilarity  #DEFINE_ALIAS
 from .layer.common import Embedding  #DEFINE_ALIAS
 from .layer.common import Linear  #DEFINE_ALIAS
 from .layer.common import Flatten  #DEFINE_ALIAS
 from .layer.common import UpSample  #DEFINE_ALIAS
-from .layer.conv import Conv2D  #DEFINE_ALIAS
-from .layer.conv import Conv2DTranspose  #DEFINE_ALIAS
-from .layer.conv import Conv3D  #DEFINE_ALIAS
-from .layer.conv import Conv3DTranspose  #DEFINE_ALIAS
+from .layer.common import UpsamplingNearest2d  #DEFINE_ALIAS
+from .layer.common import UpsamplingBilinear2d  #DEFINE_ALIAS
+from .layer.common import Bilinear  #DEFINE_ALIAS
+from .layer.common import Dropout  #DEFINE_ALIAS
+from .layer.common import Dropout2D  #DEFINE_ALIAS
+from .layer.common import Dropout3D  #DEFINE_ALIAS
+from .layer.common import AlphaDropout  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+from .layer.conv import Conv1d  #DEFINE_ALIAS
+from .layer.conv import Conv2d  #DEFINE_ALIAS
+from .layer.conv import Conv3d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose1d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose2d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose3d  #DEFINE_ALIAS
 # from .layer.conv import TreeConv        #DEFINE_ALIAS
 # from .layer.conv import Conv1D        #DEFINE_ALIAS
 from .layer.extension import RowConv  #DEFINE_ALIAS
@@ -79,22 +115,46 @@ from .layer.extension import RowConv  #DEFINE_ALIAS
 # from .layer.learning_rate import NoamDecay        #DEFINE_ALIAS
 # from .layer.learning_rate import PiecewiseDecay        #DEFINE_ALIAS
 # from .layer.learning_rate import PolynomialDecay        #DEFINE_ALIAS
+from .layer.common import Linear
 # from .layer.loss import NCELoss        #DEFINE_ALIAS
+from .layer.loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .layer.loss import CrossEntropyLoss  #DEFINE_ALIAS
 from .layer.loss import MSELoss  #DEFINE_ALIAS
 from .layer.loss import L1Loss  #DEFINE_ALIAS
 from .layer.loss import NLLLoss  #DEFINE_ALIAS
 from .layer.loss import BCELoss  #DEFINE_ALIAS
+from .layer.loss import KLDivLoss  #DEFINE_ALIAS
+from .layer.loss import MarginRankingLoss  #DEFINE_ALIAS
+from .layer.loss import CTCLoss  #DEFINE_ALIAS
+from .layer.loss import SmoothL1Loss  #DEFINE_ALIAS
 from .layer.norm import BatchNorm  #DEFINE_ALIAS
+from .layer.norm import SyncBatchNorm  #DEFINE_ALIAS
 from .layer.norm import GroupNorm  #DEFINE_ALIAS
 from .layer.norm import LayerNorm  #DEFINE_ALIAS
 from .layer.norm import SpectralNorm  #DEFINE_ALIAS
 from .layer.norm import InstanceNorm  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm1d  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm2d  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm3d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm1d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm2d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm3d  #DEFINE_ALIAS
+from .layer.rnn import *
 # from .layer.rnn import RNNCell        #DEFINE_ALIAS
 # from .layer.rnn import GRUCell        #DEFINE_ALIAS
 # from .layer.rnn import LSTMCell        #DEFINE_ALIAS
+from .layer.transformer import MultiHeadAttention
+from .layer.transformer import TransformerEncoderLayer
+from .layer.transformer import TransformerEncoder
+from .layer.transformer import TransformerDecoderLayer
+from .layer.transformer import TransformerDecoder
+from .layer.transformer import Transformer
+from .layer.distance import PairwiseDistance  #DEFINE_ALIAS
+
+from .layer.vision import PixelShuffle
 
 from .layer import loss  #DEFINE_ALIAS
 from .layer import conv  #DEFINE_ALIAS
+from .layer import vision  #DEFINE_ALIAS
 from ..fluid.dygraph.layers import Layer  #DEFINE_ALIAS
 from ..fluid.dygraph.container import LayerList, ParameterList, Sequential  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 3fefb1b053ee88b01b3a3ca71918ffafdeb71ff2..414e70853eb7163230ab2db987fc19c58e168f19 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -25,18 +25,23 @@ from . import extension
 __all__ += extension.__all__
 from . import common
 __all__ += common.__all__
+from . import pooling
+__all__ += pooling.__all__
+from . import loss
+__all__ += loss.__all__
 from .activation import brelu  #DEFINE_ALIAS
 from .activation import elu  #DEFINE_ALIAS
 from .activation import erf  #DEFINE_ALIAS
 from .activation import gelu  #DEFINE_ALIAS
-from .activation import hard_shrink  #DEFINE_ALIAS
+from .activation import hardshrink  #DEFINE_ALIAS
+from .activation import hardtanh  #DEFINE_ALIAS
 from .activation import hard_sigmoid  #DEFINE_ALIAS
 from .activation import hard_swish  #DEFINE_ALIAS
 from .activation import hsigmoid  #DEFINE_ALIAS
 from .activation import leaky_relu  #DEFINE_ALIAS
 from .activation import logsigmoid  #DEFINE_ALIAS
 from .activation import maxout  #DEFINE_ALIAS
-# from .activation import prelu        #DEFINE_ALIAS
+from .activation import prelu  #DEFINE_ALIAS
 from .activation import relu  #DEFINE_ALIAS
 from .activation import relu6  #DEFINE_ALIAS
 from .activation import selu  #DEFINE_ALIAS
@@ -47,10 +52,14 @@ from .activation import softplus  #DEFINE_ALIAS
 from .activation import softshrink  #DEFINE_ALIAS
 from .activation import softsign  #DEFINE_ALIAS
 from .activation import swish  #DEFINE_ALIAS
-from .activation import tanh_shrink  #DEFINE_ALIAS
+from .activation import tanh  #DEFINE_ALIAS
+from .activation import tanhshrink  #DEFINE_ALIAS
 from .activation import thresholded_relu  #DEFINE_ALIAS
 from .activation import log_softmax  #DEFINE_ALIAS
 from .common import dropout  #DEFINE_ALIAS
+from .common import dropout2d  #DEFINE_ALIAS
+from .common import dropout3d  #DEFINE_ALIAS
+from .common import alpha_dropout  #DEFINE_ALIAS
 # from .common import embedding        #DEFINE_ALIAS
 # from .common import fc  #DEFINE_ALIAS
 from .common import label_smooth  #DEFINE_ALIAS
@@ -58,14 +67,19 @@ from .common import one_hot  #DEFINE_ALIAS
 from .common import pad  #DEFINE_ALIAS
 from .common import pad_constant_like  #DEFINE_ALIAS
 from .common import pad2d  #DEFINE_ALIAS
+from .common import cosine_similarity  #DEFINE_ALIAS
 from .common import unfold  #DEFINE_ALIAS
 # from .common import bilinear_tensor_product        #DEFINE_ALIAS
 from .common import assign  #DEFINE_ALIAS
 from .common import interpolate  #DEFINE_ALIAS
+from .common import bilinear  #DEFINE_ALIAS
+from .conv import conv1d  #DEFINE_ALIAS
+from .conv import conv_transpose1d  #DEFINE_ALIAS
+from .common import linear  #DEFINE_ALIAS
 from .conv import conv2d  #DEFINE_ALIAS
-from .conv import conv2d_transpose  #DEFINE_ALIAS
+from .conv import conv_transpose2d  #DEFINE_ALIAS
 from .conv import conv3d  #DEFINE_ALIAS
-from .conv import conv3d_transpose  #DEFINE_ALIAS
+from .conv import conv_transpose3d  #DEFINE_ALIAS
 from .extension import add_position_encoding  #DEFINE_ALIAS
 # from .extension import autoincreased_step_counter        #DEFINE_ALIAS
 from .extension import continuous_value_model  #DEFINE_ALIAS
@@ -119,6 +133,8 @@ from .lod import hash  #DEFINE_ALIAS
 # from .lod import dynamic_gru        #DEFINE_ALIAS
 # from .lod import dynamic_lstm        #DEFINE_ALIAS
 # from .lod import dynamic_lstmp        #DEFINE_ALIAS
+from .loss import binary_cross_entropy  #DEFINE_ALIAS
+from .loss import binary_cross_entropy_with_logits  #DEFINE_ALIAS
 from .loss import bpr_loss  #DEFINE_ALIAS
 from .loss import center_loss  #DEFINE_ALIAS
 from .loss import cross_entropy  #DEFINE_ALIAS
@@ -126,10 +142,12 @@ from .loss import dice_loss  #DEFINE_ALIAS
 from .loss import edit_distance  #DEFINE_ALIAS
 from .loss import huber_loss  #DEFINE_ALIAS
 from .loss import iou_similarity  #DEFINE_ALIAS
-from .loss import kldiv_loss  #DEFINE_ALIAS
+from .loss import kl_div  #DEFINE_ALIAS
+from .loss import l1_loss  #DEFINE_ALIAS
 from .loss import log_loss  #DEFINE_ALIAS
-from .loss import margin_rank_loss  #DEFINE_ALIAS
+from .loss import margin_ranking_loss  #DEFINE_ALIAS
 from .loss import mse_loss  #DEFINE_ALIAS
+from .loss import nll_loss  #DEFINE_ALIAS
 # from .loss import nce        #DEFINE_ALIAS
 from .loss import npair_loss  #DEFINE_ALIAS
 from .loss import rank_loss  #DEFINE_ALIAS
@@ -137,22 +155,37 @@ from .loss import sampled_softmax_with_cross_entropy  #DEFINE_ALIAS
 from .loss import sigmoid_cross_entropy_with_logits  #DEFINE_ALIAS
 from .loss import sigmoid_focal_loss  #DEFINE_ALIAS
 from .loss import smooth_l1  #DEFINE_ALIAS
+from .loss import smooth_l1_loss  #DEFINE_ALIAS
 from .loss import softmax_with_cross_entropy  #DEFINE_ALIAS
 from .loss import square_error_cost  #DEFINE_ALIAS
 from .loss import ssd_loss  #DEFINE_ALIAS
 from .loss import teacher_student_sigmoid_loss  #DEFINE_ALIAS
-# from .norm import batch_norm        #DEFINE_ALIAS
+from .loss import ctc_loss  #DEFINE_ALIAS
 # from .norm import data_norm        #DEFINE_ALIAS
 # from .norm import group_norm        #DEFINE_ALIAS
-# from .norm import instance_norm        #DEFINE_ALIAS
 from .norm import l2_normalize  #DEFINE_ALIAS
-# from .norm import layer_norm        #DEFINE_ALIAS
+from .norm import batch_norm  #DEFINE_ALIAS
+from .norm import instance_norm  #DEFINE_ALIAS
+from .norm import layer_norm  #DEFINE_ALIAS
 from .norm import lrn  #DEFINE_ALIAS
+from .norm import normalize  #DEFINE_ALIAS
 # from .norm import spectral_norm        #DEFINE_ALIAS
+from .pooling import max_pool1d  #DEFINE_ALIAS
+from .pooling import avg_pool1d  #DEFINE_ALIAS
+from .pooling import adaptive_max_pool1d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool1d  #DEFINE_ALIAS
 from .pooling import pool2d  #DEFINE_ALIAS
 from .pooling import pool3d  #DEFINE_ALIAS
 from .pooling import adaptive_pool2d  #DEFINE_ALIAS
 from .pooling import adaptive_pool3d  #DEFINE_ALIAS
+from .rnn import rnn  #DEFINE_ALIAS
+from .rnn import birnn  #DEFINE_ALIAS
+from .pooling import avg_pool2d  #DEFINE_ALIAS
+from .pooling import max_pool2d  #DEFINE_ALIAS
+from .pooling import avg_pool3d  #DEFINE_ALIAS
+from .pooling import max_pool3d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool2d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool3d  #DEFINE_ALIAS
 # from .rnn import gru_unit        #DEFINE_ALIAS
 # from .rnn import lstm        #DEFINE_ALIAS
 # from .rnn import lstm_unit        #DEFINE_ALIAS
@@ -164,7 +197,7 @@ from .vision import box_clip  #DEFINE_ALIAS
 from .vision import box_coder  #DEFINE_ALIAS
 from .vision import box_decoder_and_assign  #DEFINE_ALIAS
 from .vision import collect_fpn_proposals  #DEFINE_ALIAS
-# from .vision import deformable_conv        #DEFINE_ALIAS
+# from .vision import deformable_conv  #DEFINE_ALIAS
 from .vision import deformable_roi_pooling  #DEFINE_ALIAS
 from .vision import density_prior_box  #DEFINE_ALIAS
 from .vision import detection_output  #DEFINE_ALIAS
@@ -173,10 +206,10 @@ from .vision import fsp_matrix  #DEFINE_ALIAS
 from .vision import generate_mask_labels  #DEFINE_ALIAS
 from .vision import generate_proposal_labels  #DEFINE_ALIAS
 from .vision import generate_proposals  #DEFINE_ALIAS
-from .vision import grid_sampler  #DEFINE_ALIAS
+from .vision import grid_sample  #DEFINE_ALIAS
 from .vision import image_resize  #DEFINE_ALIAS
 from .vision import image_resize_short  #DEFINE_ALIAS
-# from .vision import multi_box_head        #DEFINE_ALIAS
+# from .vision import multi_box_head  #DEFINE_ALIAS
 from .vision import pixel_shuffle  #DEFINE_ALIAS
 from .vision import prior_box  #DEFINE_ALIAS
 from .vision import prroi_pool  #DEFINE_ALIAS
@@ -193,3 +226,4 @@ from .vision import shuffle_channel  #DEFINE_ALIAS
 from .vision import space_to_depth  #DEFINE_ALIAS
 from .vision import yolo_box  #DEFINE_ALIAS
 from .vision import yolov3_loss  #DEFINE_ALIAS
+from .input import one_hot  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index dad6b252ed4d54db4a85d1912503ce5401a1ca4c..ffedb027330bda94db86dc0943a5c4a7281f254f 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -14,59 +14,255 @@
 
 # TODO: define activation functions of neural network
 from ...fluid.layers import brelu  #DEFINE_ALIAS
-from ...fluid.layers import elu  #DEFINE_ALIAS
 from ...fluid.layers import erf  #DEFINE_ALIAS
-from ...fluid.layers import gelu  #DEFINE_ALIAS
-from ...fluid.layers import hard_shrink  #DEFINE_ALIAS
 from ...fluid.layers import hard_sigmoid  #DEFINE_ALIAS
 from ...fluid.layers import hard_swish  #DEFINE_ALIAS
-from ...fluid.layers import leaky_relu  #DEFINE_ALIAS
-from ...fluid.layers import logsigmoid  #DEFINE_ALIAS
 from ...fluid.layers import maxout  #DEFINE_ALIAS
-from ...fluid.layers import relu6  #DEFINE_ALIAS
-from ...fluid.layers import selu  #DEFINE_ALIAS
 from ...fluid.layers import soft_relu  #DEFINE_ALIAS
-from ...fluid.layers import softmax  #DEFINE_ALIAS
-from ...fluid.layers import softplus  #DEFINE_ALIAS
-from ...fluid.layers import softshrink  #DEFINE_ALIAS
-from ...fluid.layers import softsign  #DEFINE_ALIAS
 from ...fluid.layers import swish  #DEFINE_ALIAS
-from ...fluid.layers import tanh_shrink  #DEFINE_ALIAS
+from ...fluid.layers import sigmoid  #DEFINE_ALIAS
 from ...fluid.layers import thresholded_relu  #DEFINE_ALIAS
+from ...tensor.math import tanh  #DEFINE_ALIAS
 
 __all__ = [
     'brelu',
     'elu',
     'erf',
     'gelu',
-    'hard_shrink',
+    'hardshrink',
+    'hardtanh',
     'hard_sigmoid',
     'hard_swish',
     'hsigmoid',
     'leaky_relu',
     'logsigmoid',
     'maxout',
-    #       'prelu',
+    'prelu',
     'relu',
     'relu6',
     'selu',
-    'sigmoid',
     'soft_relu',
     'softmax',
     'softplus',
     'softshrink',
     'softsign',
+    'sigmoid',
     'swish',
-    'tanh_shrink',
+    'tanh',
+    'tanhshrink',
     'thresholded_relu',
-    'log_softmax'
+    'log_softmax',
 ]
 
 import warnings
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
 from ...fluid import core
-from ...fluid.data_feeder import check_variable_and_dtype
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+import paddle
+
+
+def elu(x, alpha=1.0, name=None):
+    """
+    elu activation.
+
+    .. math::
+
+        elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1,6],[1,15.6]]))
+            out = F.elu(x, alpha=0.2) 
+            # [[-0.12642411  6.        ]
+            #  [ 1.          15.6      ]]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.elu(x, 'alpha', alpha)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
+    helper = LayerHelper("elu", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='elu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': alpha})
+    return out
+
+
+def gelu(x, approximate=False, name=None):
+    """
+    gelu activation.
+
+    if approximate is True
+
+    .. math::
+
+        gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    else
+
+    .. math::
+
+        gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
+    
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        approximate (bool, optional): Wether to enable approximation. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]]))
+            out1 = F.gelu(x) # [-0.158655 0.345731 0.841345 1.39979]
+            out2 = F.gelu(x, True) # [-0.158808 0.345714 0.841192 1.39957]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.gelu(x, 'approximate', approximate)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu')
+    helper = LayerHelper("gelu", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='gelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'approximate': approximate})
+    return out
+
+
+def hardshrink(x, threshold=0.5, name=None):
+    """
+    hard shrinkage activation
+
+    .. math::
+
+        hardshrink(x)=
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x > threshold \\\\
+            &x, & & if \\ x < -threshold \\\\
+            &0, & & if \\ others
+            \\end{aligned}
+            \\right.
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
+            out = F.hardshrink(x) # [-1., 0., 2.5]
+
+    """
+    if in_dygraph_mode():
+        return core.ops.hard_shrink(x, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardshrink')
+    helper = LayerHelper('hardshrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='hard_shrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold})
+    return out
+
+
+def hardtanh(x, min=-1.0, max=1.0, name=None):
+    """
+    hardtanh activation
+
+    .. math::
+
+        hardtanh(x)= \\begin{cases}
+                        max, \\text{if } x > max \\\\
+                        min, \\text{if } x < min \\\\
+                        x,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        min (float, optional): The minimum value of the linear region range. Default is -1.
+        max (float, optional): The maximum value of the linear region range. Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5]))
+            out = F.hardtanh(x) # [-1., 0.3, 1.]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.brelu(x, 't_min', min, 't_max', max)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardtanh')
+
+    helper = LayerHelper('hardtanh', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='brelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'t_min': min,
+               't_max': max})
+    return out
 
 
 def hsigmoid(input,
@@ -126,7 +322,6 @@ def hsigmoid(input,
         Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`.
 
     Examples:
-
         .. code-block:: python
 
             from paddle import fluid, nn
@@ -192,192 +387,462 @@ def hsigmoid(input,
     return out
 
 
-def relu(input, inplace=False, name=None):
+def leaky_relu(x, negative_slope=0.01, name=None):
     """
-	:alias_main: paddle.nn.functional.relu
-	:alias: paddle.nn.functional.relu,paddle.nn.functional.activation.relu
+    leaky_relu activation
+
+    .. math::
+        leaky\\_relu(x)=
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x >= 0 \\\\
+            &negative\_slope * x, & & otherwise \\\\
+            \\end{aligned}
+            \\right. \\\\
 
-    ReLU Activation.
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        negative_slope (float, optional): Slope of the activation function at
+            :math:`x < 0` . Default is 0.01.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
-    .. math:
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
 
-        out = max(x, 0)
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-2, 0, 1], 'float32'))
+            out = F.leaky_relu(x) # [-0.02, 0., 1.]
+
+    """
+    if in_dygraph_mode():
+        return core.ops.leaky_relu(x, 'alpha', negative_slope)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'leaky_relu')
+    helper = LayerHelper('leaky_relu', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='leaky_relu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': negative_slope})
+    return out
+
+
+def prelu(x, weight, name=None):
+    """
+    prelu activation.
+
+    .. math::
+
+        prelu(x) = max(0, x) + weight * min(0, x)
 
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
-        inplace (bool, optional): If inplace is True, the input and output of ``ReLU`` are the same variable.
-            Otherwise, the input and output of ``ReLU`` are different variables. Default: False. Note that if x is
-            more than one OPs' input, inplace must be False.
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
+        x (Tensor): The input Tensor with data type float32, float64.
+        weight (Tensor): The learnable parameter with data type same as ``x``.
+            The weight shape is [1] or [in], where `in` is the input channel of ``x``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Output of relu operator, a Tensor with shape same as input
+        A Tensor with the same data type and shape as ``x`` .
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn.functional as functional
-          import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = functional.relu(data)  # [0, 0, 1]
+            data = np.array([[[[-2.0,  3.0, -4.0,  5.0],
+                               [ 3.0, -4.0,  5.0, -6.0],
+                               [-7.0, -8.0,  8.0,  9.0]],
+                              [[ 1.0, -2.0, -3.0,  4.0],
+                               [-5.0,  6.0,  7.0, -8.0],
+                               [ 6.0,  7.0,  8.0,  9.0]]]], 'float32')
+            x = paddle.to_tensor(data)
+            w = paddle.to_tensor(np.array([0.25]).astype('float32'))
+            out = F.prelu(x, w)
+            # [[[[-0.5 ,  3.  , -1.  ,  5.  ],
+            #    [ 3.  , -1.  ,  5.  , -1.5 ],
+            #    [-1.75, -2.  ,  8.  ,  9.  ]],
+            #   [[ 1.  , -0.5 , -0.75,  4.  ],
+            #    [-1.25,  6.  ,  7.  , -2.  ],
+            #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
     """
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu')
+    check_variable_and_dtype(weight, 'weight',
+                             ['float16', 'float32', 'float64'], 'prelu')
+
+    helper = LayerHelper('prelu', **locals())
+    assert len(weight.shape
+               ) == 1, "The dim count of weight shape should be 1 in prelu()."
+
+    # NOTE(): The input of this API should be ``N,C,...`` format, 
+    # which means x.shape[0] is batch_size and x.shape[0] is channel.
+    mode = 'all'
+    if weight.shape[0] > 1:
+        assert len(
+            x.shape
+        ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]."
+        assert weight.shape[0] == x.shape[
+            1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+        mode = 'channel'
 
     if in_dygraph_mode():
-        if inplace:
-            warnings.warn(
-                "Inplace on ReLU is not allowed and will be discarded in dygraph mode currently."
-            )
-        return core.ops.relu(input)
+        return core.ops.prelu(x, weight, 'mode', mode)
 
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'relu')
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x,
+                "Alpha": weight},
+        outputs={"Out": out},
+        attrs={"mode": mode})
+    return out
 
-    helper = LayerHelper('relu', **locals())
-    outs = input if inplace else helper.create_variable_for_type_inference(
-        input.dtype)
-    helper.append_op(type='relu', inputs={'X': [input]}, outputs={'Out': outs})
-    return outs
 
+def relu(x, name=None):
+    """
+    relu activation.
+
+    .. math::
+
+        out = max(x, 0)
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
 
-def sigmoid(input, inplace=False, name=None):
+            x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32'))
+            out = F.relu(x) # [0., 0., 1.]
     """
-	:alias_main: paddle.nn.functional.sigmoid
-	:alias: paddle.nn.functional.sigmoid,paddle.nn.functional.activation.sigmoid
 
-    Sigmoid Activation.
+    if in_dygraph_mode():
+        return core.ops.relu(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu')
+    helper = LayerHelper('relu', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='relu', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def logsigmoid(x, name=None):
+    """
+    logsigmoid activation.
 
-    .. math:
+    .. math::
 
-        output = \frac{1}{1 + e^{-input}}
+        logsigmoid(x) = log \\frac{1}{1 + e^{-x}}
     
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
-        inplace (bool, optional): If inplace is True, the input and output are the same variable.
-            Otherwise, the input and output of are different variables. Default: False. Note that if x is
-            more than one OPs' input, inplace must be False.
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
     
     Returns:
-        Output of sigmoid operator, a Tensor with shape same as input
+        A Tensor with the same data type and shape as ``x`` .
     
     Examples:
         .. code-block:: python
-          
-          import paddle.fluid as fluid
-          import paddle.nn.functional as functional
-          import numpy as np
-          # In the static graph mode
-          input = fluid.data(name="input", shape=[None, 4])
-          output = functional.sigmoid(input)
-          place = fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
-          input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
-          output_data = exe.run(feed={"input": input_data},
-                                fetch_list=[output])
-          print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
-          # In the dynamic graph mode
-          with fluid.dygraph.guard():
-              input = fluid.dygraph.to_variable(input_data)
-              output = functional.sigmoid(input)
-              print(output) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+            out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.logsigmoid(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'logsigmoid')
+    helper = LayerHelper("logsigmoid", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def relu6(x, name=None):
+    """
+    relu6 activation
+
+    .. math::
+
+        relu6(x) = min(max(0,x), 6)
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
+            out = F.relu6(x) # [0, 0.3, 6]
     """
+    threshold = 6.0
+    if in_dygraph_mode():
+        return core.ops.relu6(x, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6')
+    helper = LayerHelper('relu6', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='relu6',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold})
+    return out
+
+
+def selu(x,
+         scale=1.0507009873554804934193349852946,
+         alpha=1.6732632423543772848170429916717,
+         name=None):
+    """
+    selu activation
+
+    .. math::
+
+        selu(x)= scale *
+                 \\begin{cases}
+                   x, \\text{if } x > 0 \\\\
+                   alpha * e^{x} - alpha, \\text{if } x <= 0
+                 \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        scale (float, optional): The value of scale(must be greater than 1.0) for selu. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha(must be no less than zero) for selu. Default is 1.6732632423543772848170429916717
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
+            out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]]
+    """
+    if scale <= 1.0:
+        raise ValueError(
+            "The scale must be greater than 1.0. Received: {}.".format(scale))
+
+    if alpha < 0:
+        raise ValueError(
+            "The alpha must be no less than zero. Received: {}.".format(alpha))
 
     if in_dygraph_mode():
-        if inplace:
-            warnings.warn(
-                "Inplace on sigmoid is not allowed and will be discarded in dygraph mode currently."
-            )
-        return core.ops.sigmoid(input)
-
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'sigmoid')
-    helper = LayerHelper("sigmoid", **locals())
-    outputs = helper.create_variable_for_type_inference(input.dtype)
+        return core.ops.selu(x, 'scale', scale, 'alpha', alpha)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu')
+    helper = LayerHelper('selu', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
-        type='sigmoid', inputs={'X': [input]}, outputs={'Out': outputs})
-    return outputs
+        type='selu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'scale': scale,
+               'alpha': alpha})
+    return out
 
 
-def log_softmax(input, axis=None, dtype=None, name=None):
+def softmax(x, axis=-1, dtype=None, name=None):
     """
-	:alias_main: paddle.nn.functional.log_softmax
-	:alias: paddle.nn.functional.log_softmax,paddle.nn.functional.activation.log_softmax
+    This operator implements the softmax layer. The calculation process is as follows:
 
-    This operator implements the log_softmax layer. The calculation process is as follows:
+    1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
+
+    2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second
+    dimension(row length) is the same as the dimension :attr:`axis` of ``x``,
+    and the first dimension(column length) is the product of all other dimensions
+    of ``x``. For each row of the matrix, the softmax operator squashes the
+    K-dimensional(K is the width of the matrix, which is also the size of ``x``'s
+    dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional
+    vector of real values in the range [0, 1] that add up to 1.
+
+    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
+    are performed to restore the two-dimensional matrix to the same dimension as the ``x`` .
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
 
     .. math::
 
-        Out[i, j] = log(softmax(x)) 
-                  = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+        softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])}
+
+    Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+
+          Attrs:
+            axis = -1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+
+        Case 2:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+          Attrs:
+            axis = 1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
+                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
+                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
+                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
+                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
+                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
 
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float32, or float64.
-        axis (int, optional): The index of dimension to perform softmax calculations, it should be in
-            range :math:`[-1, rank-1]`, while :math:`rank` is the rank of input variable. Default: None. 
-            None and -1 means the last dimension.
-        dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of returned tensor. If specified,
-            the input tensor is casted to dtype before the operation is performed. This is useful for
-            preventing data type overflows. Default: None. Supported dtype: float32 or float64
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
- 
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
     Returns:
-        Variable: ``Tensor`` indicates the output of softmax. The data type and shape are the same as ``input``.
+        A Tensor with the same shape and data type (use ``dtype`` if it is
+        specified) as x.
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn.functional as F
-          import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
 
-          data = np.array([[[-2.0, 3.0, -4.0, 5.0],
-                            [3.0, -4.0, 5.0, -6.0],
-                            [-7.0, -8.0, 8.0, 9.0]],
-                           [[1.0, -2.0, -3.0, 4.0],
-                            [-5.0, 6.0, 7.0, -8.0],
-                            [6.0, 7.0, 8.0, 9.0]]]).astype('float32')
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = F.log_softmax(data, -1)
-              # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
-              #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
-              #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
-              #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
-              #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
-              #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+            paddle.disable_static()
+
+            x = np.array([[[2.0, 3.0, 4.0, 5.0],
+                        [3.0, 4.0, 5.0, 6.0],
+                        [7.0, 8.0, 8.0, 9.0]],
+                        [[1.0, 2.0, 3.0, 4.0],
+                        [5.0, 6.0, 7.0, 8.0],
+                        [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = F.softmax(x)
+            out2 = F.softmax(x, dtype='float64')
+            # out1's data type is float32; out2's data type is float64
+            # out1 and out2's value is as follows:
+            # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+            # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
     """
 
-    axis = -1 if axis is None else axis
-    dtype = convert_np_dtype_to_dtype_(dtype) if dtype is not None else dtype
+    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+    use_cudnn = True if axis is -1 else False
 
     if in_dygraph_mode():
-        outs_cast = input if dtype is None \
-            else core.ops.cast(input, 'in_dtype', input.dtype, 'out_dtype', dtype)
-        outs_softmax = core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn',
-                                        False)
-        return core.ops.log(outs_softmax)
+        outs_cast = x if dtype is None \
+            else core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+        return core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn)
 
     if dtype is None:
-        check_variable_and_dtype(
-            input, 'input', ['float16', 'float32', 'float64'], 'log_softmax')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'softmax')
+    else:
+        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'softmax',
+                    'If dtype is not None, it only support float32 or float64.')
 
-    helper = LayerHelper("log_softmax", **locals())
-    outs_cast = input
+    helper = LayerHelper("softmax", **locals())
+    outs_cast = x
     if dtype is not None:
         outs_cast = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type='cast',
-            inputs={'X': input},
+            inputs={'X': x},
             outputs={'Out': outs_cast},
-            attrs={'in_dtype': input.dtype,
+            attrs={'in_dtype': x.dtype,
                    'out_dtype': dtype})
 
     outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
@@ -386,10 +851,277 @@ def log_softmax(input, axis=None, dtype=None, name=None):
         inputs={'X': outs_cast},
         outputs={'Out': outs_softmax},
         attrs={'axis': axis,
-               'use_cudnn': False})
+               'use_cudnn': use_cudnn})
+
+    return outs_softmax
+
+
+def softplus(x, beta=1, threshold=20, name=None):
+    """
+    softplus activation
+
+    .. math::
+
+        softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\
+        \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        beta (float, optional): The value of beta for softplus. Default is 1
+        threshold (float, optional): The value of threshold for softplus. Default is 20
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
 
-    outs_log = helper.create_variable_for_type_inference(outs_softmax.dtype)
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355]
+    """
+    if in_dygraph_mode():
+        return core.ops.softplus(x, 'beta', beta, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softplus')
+    helper = LayerHelper('softplus', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
-        type='log', inputs={'X': outs_softmax}, outputs={'Out': outs_log})
+        type='softplus',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'beta': beta,
+               'threshold': threshold})
+    return out
+
+
+def softshrink(x, threshold=0.5, name=None):
+    """
+    softshrink activation
 
-    return outs_log
+    .. math::
+
+        softshrink(x)= \\begin{cases}
+                        x - threshold, \\text{if } x > threshold \\\\
+                        x + threshold, \\text{if } x < -threshold \\\\
+                        0,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
+            out = F.softshrink(x) # [-0.4, 0, 0, 0.3]
+    """
+    if threshold < 0:
+        raise ValueError(
+            "The threshold must be no less than zero. Received: {}.".format(
+                threshold))
+
+    if in_dygraph_mode():
+        return core.ops.softshrink(x, 'lambda', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softshrink')
+    helper = LayerHelper('softshrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='softshrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'lambda': threshold})
+    return out
+
+
+def softsign(x, name=None):
+    """
+    softsign activation
+
+    .. math::
+
+        softsign(x) = \\frac{x}{1 + |x|}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
+    """
+    if in_dygraph_mode():
+        return core.ops.softsign(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softsign')
+    helper = LayerHelper('softsign', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='softsign', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def tanhshrink(x, name=None):
+    """
+    tanhshrink activation
+
+    .. math::
+
+        tanhshrink(x) = x - tanh(x)
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
+    """
+    if in_dygraph_mode():
+        return core.ops.tanh_shrink(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'tanhshrink')
+    helper = LayerHelper('tanh_shrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='tanh_shrink', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def log_softmax(x, axis=-1, dtype=None, name=None):
+    """
+    This operator implements the log_softmax layer. The calculation process is
+    as follows:
+
+    .. math::
+
+        log\\_softmax[i, j] = log(softmax(x))
+                            = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])})
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+ 
+    Returns:
+        A Tensor with the same shape and data type (use ``dtype`` if it is
+        specified) as x.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[-2.0, 3.0, -4.0, 5.0],
+                            [3.0, -4.0, 5.0, -6.0],
+                            [-7.0, -8.0, 8.0, 9.0]],
+                            [[1.0, -2.0, -3.0, 4.0],
+                            [-5.0, 6.0, 7.0, -8.0],
+                            [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = F.log_softmax(x)
+            out2 = F.log_softmax(x, dtype='float64')
+            # out1's data type is float32; out2's data type is float64
+            # out1 and out2's value is as follows:
+            # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
+            #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
+            #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
+            #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
+            #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
+            #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+    """
+
+    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if in_dygraph_mode():
+        if dtype is not None:
+            x = core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+        return core.ops.log_softmax(x, 'axis', axis)
+
+    if dtype is None:
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'log_softmax')
+    else:
+        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'log_softmax',
+                    'If dtype is not None, it only support float32 or float64.')
+
+    helper = LayerHelper("log_softmax", **locals())
+    out_cast = x
+    if dtype is not None:
+        out_cast = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='cast',
+            inputs={'X': x},
+            outputs={'Out': out_cast},
+            attrs={'in_dtype': x.dtype,
+                   'out_dtype': dtype})
+
+    out = helper.create_variable_for_type_inference(out_cast.dtype)
+    helper.append_op(
+        type='log_softmax',
+        inputs={'X': out_cast},
+        outputs={'Out': out},
+        attrs={'axis': axis})
+
+    return out
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index fe41cb6e64c34f34add3c0652ab5b30efe958161..623af3277fba0e29fb77b02c711e258602f1f75a 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -13,26 +13,45 @@
 # limitations under the License.
 
 import warnings
+import paddle
+from ...fluid.framework import in_dygraph_mode, default_main_program
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.layers.tensor import Variable, fill_constant
-
+from paddle.fluid.layers.tensor import Variable, fill_constant, zeros, concat
+from ...fluid.layers import core
+from ...fluid import dygraph_utils
 # TODO: define the common functions to build a neural network  
-from ...fluid.layers import dropout  #DEFINE_ALIAS
 from ...fluid.layers import label_smooth  #DEFINE_ALIAS
 from ...fluid import one_hot  #DEFINE_ALIAS
-from ...fluid.layers import pad  #DEFINE_ALIAS
 from ...fluid.layers import pad2d  #DEFINE_ALIAS
 from ...fluid.layers import unfold  #DEFINE_ALIAS
 from ...fluid.layers import assign  #DEFINE_ALIAS
+from ...fluid.layers import squeeze  #DEFINE_ALIAS
+from ...fluid.layers import unsqueeze  #DEFINE_ALIAS
+from ...fluid.layers import elementwise_mul  #DEFINE_ALIAS
+from ...tensor import clip
+from ...tensor import sum
+from ...tensor import sqrt
+from ...tensor import sum  #DEFINE_ALIAS
+from ...tensor import sqrt  #DEFINE_ALIAS
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+from ...fluid.framework import Variable, in_dygraph_mode, _varbase_creator
 
 #from ...fluid.layers import fc  #DEFINE_ALIAS
 from ...fluid.layers import pad_constant_like  #DEFINE_ALIAS
+from ...fluid.framework import in_dygraph_mode
+from ...fluid import core, dygraph_utils
+from ...fluid import core, layers
+from ...fluid.data_feeder import check_variable_and_dtype
 
 __all__ = [
     'dropout',
+    'dropout2d',
+    'dropout3d',
+    'alpha_dropout',
     #       'embedding',
     #       'fc',
     'label_smooth',
+    'linear',
     'one_hot',
     'pad',
     'pad_constant_like',
@@ -40,29 +59,29 @@ __all__ = [
     'unfold',
     #       'bilinear_tensor_product',
     'assign',
-    'interpolate'
+    'interpolate',
+    'upsample',
+    'bilinear',
+    'cosine_similarity',
 ]
 
 
-def interpolate(input,
+def interpolate(x,
                 size=None,
                 scale_factor=None,
                 mode='nearest',
                 align_corners=False,
-                align_mode=1,
+                align_mode=0,
                 data_format='NCHW',
                 name=None):
     """
-	:alias_main: paddle.nn.functional.interpolate
-	:alias: paddle.nn.functional.interpolate,paddle.nn.functional.common.interpolate
 
     This op resizes a batch of images.
     The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
     or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
     (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
     and the resizing only applies on the three dimensions(depth, height and width).
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the
-    future and only use :attr:`out_shape` instead.
+
     Supporting resample methods:
         'linear' : Linear interpolation
         'bilinear' : Bilinear interpolation
@@ -87,7 +106,7 @@ def interpolate(input,
     interpolating functions of three variables (e.g. D-direction,
     H-direction and W-direction in this op) on a rectilinear 3D grid.
     The linear interpolation is performed on three directions.
-    Align_corners and align_mode are optional parameters,the calculation method
+    align_corners and align_mode are optional parameters,the calculation method
     of interpolation can be selected by them.
 
     Bicubic interpolation is an extension of cubic interpolation for interpolating
@@ -117,18 +136,12 @@ def interpolate(input,
                 W_out = W_{in} * scale_{factor}
         
         Nearest neighbor interpolation:
-          if:
+
               align_corners = False
               input : (N,C,H_in,W_in)
               output: (N,C,H_out,W_out) where:
               H_out = floor (H_{in} * scale_{factor})
               W_out = floor (W_{in} * scale_{factor})
-          else:
-              align_corners = True
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              H_out = round(H_{in} * scale_{factor})
-              W_out = round(W_{in} * scale_{factor})
 
         Bilinear interpolation:
           if:
@@ -187,22 +200,22 @@ def interpolate(input,
     https://en.wikipedia.org/wiki/Bicubic_interpolation
     
     Parameters:
-        input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
-        size (list|tuple|Variable|None): Output shape of image resize
+        size (list|tuple|Tensor|None): Output shape of image resize
              layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
              when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
              Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
              If a Tensor Variable, its dimensions size should be a 1.
-        scale_factor (float|Variable|None): The multiplier for the input height or width. At
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
              least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
-             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list.
              Default: None.
         mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear',
                        'bicubic' and 'trilinear' currently. Default: 'nearest'
         align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the
                                input and output tensors are aligned, preserving the values at the
-                               corner pixels.
+                               corner pixels.This only has an effect when 'linear', 'bilinear', 'bicubic' or 'trilinear'.
                                Default: False
         align_mode(int)  :  An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above,
                             it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for
@@ -220,7 +233,7 @@ def interpolate(input,
         A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
         or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
     Raises:
-        TypeError: size should be a list or tuple or Variable.
+        TypeError: size should be a list or tuple or Tensor.
         ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
                     'trilinear', 'bicubic', or 'nearest' currently.
         ValueError: 'linear' only support 3-D tensor.
@@ -238,53 +251,27 @@ def interpolate(input,
     Examples:
         .. code-block:: python
 
-	    #declarative mode
 	    import paddle
 	    import numpy as np
-	    input = fluid.data(name="input", shape=[None,3,6,10])
-	    #1
-	    output = paddle.nn.functional.interpolate(input=input, size=[12,12])
-	    #2
-	    #x = np.array([2]).astype("int32")
-	    #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32")
-	    #fluid.layers.assign(input=x, output=dim1)
-	    #output = paddle.nn.functional.interpolate(input=input, size=[12,dim1])
-	    #3
-	    #x = np.array([3,12]).astype("int32")
-	    #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
-	    #fluid.layers.assign(input=x, output=shape_tensor)
-	    #output = paddle.nn.functional.interpolate(input=input, size=shape_tensor)
-	    #4
-	    #x = np.array([0.5]).astype("float32")
-	    #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32")
-	    #fluid.layers.assign(x,scale_tensor)
-	    #output = paddle.nn.functional.interpolate(input=input, scale_factor=scale_tensor)
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    input_data = np.random.rand(2,3,6,10).astype("float32")
-	    output_data = exe.run(fluid.default_main_program(),
-                feed={"input":input_data},
-                fetch_list=[output],
-                return_numpy=True)
-
-	    print(output_data[0].shape)
-	    #1
-	    # (2, 3, 12, 12)
-	    #2
-	    # (2, 3, 12, 2)
-	    #3
-	    # (2, 3, 3, 12)
-	    #4
-	    # (2, 3, 3, 5)
-	    #imperative mode
-	    import paddle.fluid.dygraph as dg
-	    with dg.guard(place) as g:
-    		input = dg.to_variable(input_data)
-    		output = paddle.nn.functional.interpolate(input=input, size=[12,12])
-    		print(output.shape)
-		# [2L, 3L, 12L, 12L]
+            import paddle.nn.functional as F
+            paddle.disable_static()
+            
+            # given out size
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            x = paddle.to_tensor(input_data)
+            output_1 = F.interpolate(x=x, size=[12,12])
+    	    print(output_1.shape)
+	    # [2L, 3L, 12L, 12L]
+            
+            # given scale
+            output_2 = F.interpolate(x=x, scale_factor=[2,1])
+            print(output_2.shape)
+            # [2L, 3L, 12L, 10L]
+            
+            # bilinear interp
+            output_3 = F.interpolate(x=x, scale_factor=[2,1], mode="bilinear")
+            print(output_2.shape)
+            # [2L, 3L, 12L, 10L]
     """
     data_format = data_format.upper()
     resample = mode.upper()
@@ -302,13 +289,13 @@ def interpolate(input,
             "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', "
             " 'bicubic' or 'nearest' currently.")
 
-    if resample in ['LINEAR'] and len(input.shape) != 3:
+    if resample in ['LINEAR'] and len(x.shape) != 3:
         raise ValueError("'linear' only support 3-D tensor.")
 
-    if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(input.shape) != 4:
+    if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(x.shape) != 4:
         raise ValueError(
             "'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.")
-    if resample == 'TRILINEAR' and len(input.shape) != 5:
+    if resample == 'TRILINEAR' and len(x.shape) != 5:
         raise ValueError("'trilinear'only support 5-D tensor.")
 
     if size is None and scale_factor is None:
@@ -319,19 +306,21 @@ def interpolate(input,
 
     if align_mode != 0 and align_mode != 1:
         raise ValueError("align_mode can only be 0 or 1")
-
-    helper = LayerHelper('{}_interp'.format(resample_type), **locals())
+    if align_corners != 0 and resample == 'NEAREST':
+        raise ValueError(
+            "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear"
+        )
+    helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals())
     dtype = helper.input_dtype()
-
-    if len(input.shape) == 3 and data_format not in ['NCW', 'NWC']:
+    if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCW` or `NWC` supported for 3-D input.")
-    elif len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
+    elif len(x.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCHW` or `NHWC` supported for 4-D input.")
-    elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
+    elif len(x.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCDHW` or `NDHWC` supported for 5-D input.")
@@ -344,7 +333,10 @@ def interpolate(input,
     if data_format == 'NHWC' or data_format == 'NDHWC' or data_format == 'NWC':
         data_layout = 'NHWC'
 
-    inputs = {"X": input}
+    if resample == 'NEAREST':
+        align_corners = False
+
+    inputs = {"X": x}
     attrs = {
         "out_d": -1,
         "out_h": -1,
@@ -393,7 +385,7 @@ def interpolate(input,
                         size_list.append(dim)
                 inputs['SizeTensor'] = new_size_tensor
 
-            if len(input.shape) == 3:
+            if len(x.shape) == 3:
                 if len(out_shape) != 1:
                     raise ValueError(
                         "out_shape length should be 2 for input 3-D tensor")
@@ -402,7 +394,7 @@ def interpolate(input,
                 else:
                     out_shape = list(map(int, out_shape))
                     attrs['out_w'] = out_shape[0]
-            if len(input.shape) == 4:
+            if len(x.shape) == 4:
                 if len(out_shape) != 2:
                     raise ValueError("out_shape length should be 2 for "
                                      "input 4-D tensor.")
@@ -413,7 +405,7 @@ def interpolate(input,
                     out_shape = list(map(int, out_shape))
                     attrs['out_h'] = out_shape[0]
                     attrs['out_w'] = out_shape[1]
-            if len(input.shape) == 5:
+            if len(x.shape) == 5:
                 if len(out_shape) != 3:
                     raise ValueError("out_shape length should be 3 for "
                                      "input 5-D tensor.")
@@ -434,15 +426,1023 @@ def interpolate(input,
         elif isinstance(scale, float) or isinstance(scale, int):
             if scale <= 0:
                 raise ValueError("Attr(scale) should be greater than zero.")
-            attrs['scale'] = float(scale)
+            scale_list = []
+            for i in range(len(x.shape) - 2):
+                scale_list.append(scale)
+            attrs['scale'] = list(map(float, scale_list))
+        elif isinstance(scale, list):
+            if len(scale) != len(x.shape) - 2:
+                raise ValueError("scale_shape length should be {} for "
+                                 "input {}-D tensor.".format(
+                                     len(x.shape) - 2, len(x.shape)))
+            for value in scale:
+                if value <= 0:
+                    raise ValueError("Attr(scale) should be greater than zero.")
+            attrs['scale'] = list(map(float, scale))
         else:
             raise TypeError(
-                "Attr(scale)'s type should be float, int or Variable.")
+                "Attr(scale)'s type should be float, int, list or Tensor.")
+
+    if in_dygraph_mode():
+        attr_list = []
+        for k, v in attrs.items():
+            attr_list.append(k)
+            attr_list.append(v)
+        dy_attr = tuple(attr_list)
 
+        if resample_type == "linear":
+            out = core.ops.linear_interp_v2(x, *dy_attr)
+        if resample_type == "bilinear":
+            out = core.ops.bilinear_interp_v2(x, *dy_attr)
+        if resample_type == "trilinear":
+            out = core.ops.trilinear_interp_v2(x, *dy_attr)
+        if resample_type == "nearest":
+            out = core.ops.nearest_interp_v2(x, *dy_attr)
+        if resample_type == "bicubic":
+            out = core.ops.bicubic_interp_v2(x, *dy_attr)
+        return out
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
-        type='{}_interp'.format(resample_type),
+        type='{}_interp_v2'.format(resample_type),
         inputs=inputs,
         outputs={"Out": out},
         attrs=attrs)
     return out
+
+
+def upsample(x,
+             size=None,
+             scale_factor=None,
+             mode='nearest',
+             align_corners=False,
+             align_mode=0,
+             data_format='NCHW',
+             name=None):
+    """
+    This op resizes a batch of images.
+    The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
+    or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
+    (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
+    and the resizing only applies on the three dimensions(depth, height and width).
+
+    Supporting resample methods:
+        'linear' : Linear interpolation
+        'bilinear' : Bilinear interpolation
+        'trilinear' : Trilinear interpolation
+        'nearest' : Nearest neighbor interpolation
+        'bicubic' : Bicubic interpolation
+    Linear interpolation is the method of using a line connecting two known quantities 
+    to determine the value of an unknown quantity between the two known quantities. 
+    
+    Nearest neighbor interpolation is to perform nearest neighbor interpolation
+    in both the 3rd dimension(in height direction) and the 4th dimension(in width
+    direction) on input tensor.
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this op) on a rectilinear 2D grid. The key idea is
+    to perform linear interpolation first in one direction, and then
+    again in the other direction.
+    
+    Bicubic interpolation is an extension of cubic interpolation for interpolating
+    data points on a two-dimensional regular grid. The interpolated surface is
+    smoother than corresponding surfaces obtained by bilinear interpolation or
+    nearest-neighbor interpolation.
+    Trilinear interpolation is an extension of linear interpolation for
+    interpolating functions of three variables (e.g. D-direction,
+    H-direction and W-direction in this op) on a rectilinear 3D grid.
+    The linear interpolation is performed on three directions.
+    align_corners and align_mode are optional parameters,the calculation method
+    of interpolation can be selected by them.
+    Example:
+    .. code-block:: text
+        For scale_factor:
+            if align_corners = True && out_size > 1 :
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            else:
+              scale_factor = float(in_size/out_size)
+        Linear interpolation:
+            if:
+                align_corners = False , align_mode = 0
+                input : (N,C,W_in)
+                output: (N,C,W_out) where:
+                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+            else:
+                input : (N,C,W_in)
+                output: (N,C,W_out) where:
+                W_out = W_{in} * scale_{factor}
+        Nearest neighbor interpolation:
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = floor (H_{in} * scale_{factor})
+              W_out = floor (W_{in} * scale_{factor})
+          else:
+              align_corners = True
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
+        
+        Bilinear interpolation:
+          if:
+              align_corners = False , align_mode = 0
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+        Bicubic interpolation:
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+        Trilinear interpolation:
+          if:
+              align_corners = False , align_mode = 0
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+              D_out = D_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+    https://en.wikipedia.org/wiki/Linear_interpolation.
+    For details of linear interpolation, please refer to Wikipedia:
+    
+    For details of nearest neighbor interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
+    
+    For details of bilinear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bilinear_interpolation.
+    
+    For details of bicubic interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bicubic_interpolation
+    
+    For details of trilinear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Trilinear_interpolation.
+    
+    Parameters:
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
+             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
+             least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             Default: None.
+        mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear',
+                       'bicubic' and 'trilinear' currently. Default: 'nearest'
+        align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the
+                               input and output tensors are aligned, preserving the values at the
+                               corner pixels.
+                               Default: False
+        align_mode(int)  :  An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above,
+                            it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for
+                            src_idx = scale_factor*dst_index.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`,
+            `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored
+            in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+        or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
+                    'trilinear', 'bicubic', or 'nearest' currently.
+        ValueError: 'linear' only support 3-D tensor.
+        ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.
+        ValueError: 'trilinear' only support 5-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 1 for input 3-D tensor.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: size length should be 3 for input 5-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        TypeError: align_corners should be a bool value
+        ValueError: align_mode can only be '0' or '1'
+        ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'.
+        Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+            import paddle.nn.functional as F
+            paddle.disable_static()
+
+            input = paddle.to_tensor(input_data)
+            output = F.upsample(input=input, size=[12,12])
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+
+    """
+    return interpolate(x, size, scale_factor, mode, align_corners, align_mode,
+                       data_format)
+
+
+def bilinear(x1, x2, weight, bias=None, name=None):
+    """
+
+    This layer performs bilinear on two inputs.
+    See :ref:`api_nn_Bilinear` for details and output shape.
+
+    Parameters:
+       x1 (Tensor): the first input tensor, it's data type should be float32, float64.
+       x2 (Tensor): the second input tensor, it's data type should be float32, float64.
+       weight (Parameter): The learnable weights of this layer, shape is [out_features, in1_features, in2_features].
+       bias (Parameter, optional): The learnable bias(Bias) of this layer, shape is [1, out_features]. If it is set to None, no bias will be added to the output units. The default value is None.
+       name (str, optional): The default value is None. Normally there is no need for user
+           to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Returns:
+       Tensor: A 2-D Tensor of shape [batch_size, out_features].
+
+    Examples:
+       .. code-block:: python
+
+        import paddle
+        import numpy
+        import paddle.nn.functional as F
+
+        paddle.disable_static()
+        x1 = numpy.random.random((5, 5)).astype('float32')
+        x2 = numpy.random.random((5, 4)).astype('float32')
+        w = numpy.random.random((1000, 5, 4)).astype('float32')
+        b = numpy.random.random((1, 1000)).astype('float32')
+
+        result = F.bilinear(paddle.to_tensor(x1), paddle.to_tensor(x2), paddle.to_tensor(w), paddle.to_tensor(b))           # result shape [5, 1000]
+
+    """
+
+    if in_dygraph_mode():
+        return core.ops.bilinear_tensor_product(x1, x2, weight, bias)
+
+    check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear')
+    check_variable_and_dtype(x2, 'x2', ['float32', 'float64'], 'bilinear')
+
+    inputs = {"X": x1, "Y": x2, "Weight": weight}
+    if bias is not None:
+        inputs["Bias"] = bias
+
+    helper = LayerHelper("bilinear", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x1.dtype)
+
+    helper.append_op(
+        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
+
+    return out
+
+
+def dropout(x,
+            p=0.5,
+            axis=None,
+            training=True,
+            mode="upscale_in_train",
+            name=None):
+    """
+    Dropout is a regularization technique for reducing overfitting by preventing
+    neuron co-adaption during training. The dropout operator randomly sets the
+    outputs of some units to zero, while upscale others according to the given
+    dropout probability.
+
+    Args:
+        x (Tensor): The input tensor. The data type is float32 or float64.
+        p (float | int): Probability of setting units to zero. Default 0.5.
+        axis (int | list): The axis along which the dropout is performed. Default None.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+                           1. upscale_in_train(default), upscale the output at training time
+
+                              - train: out = input * mask / ( 1.0 - dropout_prob )
+                              - inference: out = input
+
+                           2. downscale_in_infer, downscale the output at inference
+
+                              - train: out = input * mask
+                              - inference: out = input * (1.0 - dropout_prob)
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout, has same shape and data type as `x` .
+
+    Examples:
+        We use ``p=0.5`` in the following description for simplicity.
+        1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly.
+            Let's see a simple case when x is a 2d tensor with shape 2*3:
+            [[1 2 3]
+             [4 5 6]]
+            we generate mask with the same shape as x, which is 2*3. The value of mask is
+            sampled from a Bernoulli distribution randomly. For example, we may get such mask:
+            [[0 1 0]
+             [1 0 1]]
+            So the output is obtained from elementwise multiply of x and mask:
+            [[0 2 0]
+             [4 0 6]]
+            Using default setting, i.e. ``mode='upscale_in_train'`` ,
+            if in training phase, the final upscale output is:
+            [[0 4 0 ]
+             [8 0 12]]
+            if in test phase, the output is the same as input:
+            [[1 2 3]
+             [4 5 6]]
+            we can also set ``mode='downscale_in_infer'`` , then
+            if in training phase, the final output is:
+            [[0 2 0]
+             [4 0 6]]
+            if in test phase, the scale output is:
+            [[0.5 1.  1.5]
+             [2.  2.5 3. ]]
+
+        2. When ``axis!=None`` , this is useful for dropping whole channels from an image or sequence.
+            Let's see the simple case when x is a 2d tensor with shape 2*3 again:
+            [[1 2 3]
+             [4 5 6]]
+            (1) If ``axis=0`` , this means the dropout is only performed in axis `0` .
+                we generate mask with the shape 2*1. Only in axis `0` the value is randomly selected.
+                For example, we may get such mask:
+                [[1]
+                 [0]]
+                The output is obtained from elementwise multiply of x and mask. Doing that the mask will be
+                broadcast from 2*1 to 2*3:
+                [[1 1 1]
+                 [0 0 0]]
+                and the result after elementwise multiply is:
+                [[1 2 3]
+                 [0 0 0]]
+                then we can do upscale or downscale according to the setting of other arguments.
+            (2) If ``axis=1`` , this means the dropout is only performed in axis `1` .
+                we generate mask with the shape 1*3. Only in axis `1` the value is randomly selected.
+                For example, we may get such mask:
+                [[1 0 1]]
+                Doing elementwise multiply the mask will be broadcast from 1*3 to 2*3:
+                [[1 0 1]
+                 [1 0 1]]
+                and the result after elementwise multiply is:
+                [[1 0 3]
+                 [4 0 6]]
+            (3) What about ``axis=[0, 1]`` ? This means the dropout is performed in all axes of x,
+                which is the same case as default setting ``axis=None`` .
+            (4) You may note that logically `axis=None` means the dropout is performed in none axis of x,
+                We generate mask with the shape 1*1. Whole input is randomly selected or dropped.
+                For example, we may get such mask:
+                [[0]]
+                Doing elementwise multiply the mask will be broadcast from 1*1 to 2*3:
+                [[0 0 0]
+                 [0 0 0]]
+                and the result after elementwise multiply is:
+                [[0 0 0]
+                 [0 0 0]]
+                Actually this is not what we want because all elements may set to zero~
+            When x is a 4d tensor with shape `NCHW`, we can set ``axis=[0,1]`` and the dropout will be performed
+            in channel `N` and `C`, `H` and `W` is tied, i.e.
+            paddle.nn.dropout(x, p, axis=[0,1])
+            Please refer to ``paddle.nn.functional.dropout2d`` for more details.
+            Similarly, when x is a 5d tensor with shape `NCDHW`, we can set ``axis=[0,1]`` to perform
+            dropout3d. Please refer to ``paddle.nn.functional.dropout3d`` for more details.
+
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[1,2,3], [4,5,6]]).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout(x, 0.5)
+            y_test = paddle.nn.functional.dropout(x, 0.5, training=False) 
+            y_0 = paddle.nn.functional.dropout(x, axis=0)
+            y_1 = paddle.nn.functional.dropout(x, axis=1)
+            y_01 = paddle.nn.functional.dropout(x, axis=[0,1])
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+            print(y_0.numpy())
+            print(y_1.numpy())
+            print(y_01.numpy())
+
+    """
+    if not isinstance(p, (float, int)):
+        raise TypeError("p argument should be a number")
+    if p < 0 or p > 1:
+        raise ValueError("p argument should between 0 and 1")
+    if mode not in ('downscale_in_infer', 'upscale_in_train'):
+        raise ValueError(
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+    if axis and not isinstance(axis, (int, list)):
+        raise TypeError("datatype of axis argument should be int or list")
+
+    if axis == None:  # commonly used dropout
+        seed = None
+        mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+
+        def get_attrs(prog, dropout_prob, is_test, seed):
+            if (seed is None or seed == 0) and prog.random_seed != 0:
+                seed = prog.random_seed
+            attrs = {
+                'dropout_prob': dropout_prob,
+                'is_test': is_test,
+                'fix_seed': seed is not None,
+                'seed': seed if seed is not None else 0,
+                'dropout_implementation': mode,
+            }
+            return attrs
+
+        if in_dygraph_mode():
+            if default_main_program().random_seed != 0:
+                seed = default_main_program().random_seed
+            out, mask = core.ops.dropout(
+                x, 'dropout_prob', p, 'is_test', not training, 'fix_seed',
+                seed is not None, 'seed', seed
+                if seed is not None else 0, 'dropout_implementation', mode)
+            return out
+
+        helper = LayerHelper('dropout', **locals())
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'dropout')
+
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        mask = helper.create_variable_for_type_inference(
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+
+        attrs = get_attrs(helper.main_program, p, not training, seed)
+
+        helper.append_op(
+            type='dropout',
+            inputs={'X': [x]},
+            outputs={'Out': [out],
+                     'Mask': [mask]},
+            attrs=attrs)
+        return out
+    else:  #sometimes called dropout_nd #TODO: optimize with c++
+        if not in_dygraph_mode():
+            check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout')
+        dtype = x.dtype
+        keep_prob = 1 - p
+        if training:
+            if p == 1.:
+                return layers.scale(x, scale=0.)
+
+            scale_input = layers.scale(
+                x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x
+
+            #get mask shape
+            input_shape = x.shape
+            drop_axes = [axis] if isinstance(axis, int) else axis
+            if max(drop_axes) > len(input_shape) - 1:
+                raise ValueError("axis value should less than dimensions of x:{}, but get drop_axes value:{} " \
+                                 .format(len(input_shape), max(drop_axes)))
+            if len(drop_axes) > len(input_shape):
+                raise ValueError(
+                    "length of axis should not greater than dimensions of x:{}, but get length of drop axes: {}".
+                    format(len(input_shape), len(drop_axes)))
+            mask_shape = [1] * len(input_shape)
+            for i in drop_axes:
+                mask_shape[i] = input_shape[i]
+
+            #get mask
+            random_tensor = layers.uniform_random(
+                mask_shape, dtype='float32', min=0., max=1.0)
+            p = layers.fill_constant(shape=[1], dtype='float32', value=p)
+            keep_mask = layers.greater_equal(random_tensor, p)
+
+            scale_input = layers.cast(scale_input, dtype)
+            keep_mask = layers.cast(keep_mask, dtype)
+            ret = paddle.multiply(scale_input, keep_mask, name=name)
+            return ret
+        else:  # test
+            ret = layers.scale(
+                x, scale=keep_prob) if mode == 'downscale_in_infer' else x
+            return ret
+
+
+def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None):
+    """
+    Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` ,
+    a channel is a 2D feature map with the shape `HW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    Args:
+        x (Tensor):  The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C].
+                     The data type is float32 or float64.
+        p (float): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCHW` , `NHWC` . The default is `NCHW` . When it is `NCHW` , the data is
+                                    stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout2d, has same shape and data type as `x` .
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5)).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout2d(x)  #train
+            y_test = paddle.nn.functional.dropout2d(x, training=False) #test
+            for i in range(2):
+                for j in range(3):
+                    print(x.numpy()[i,j,:,:])
+                    print(y_train.numpy()[i,j,:,:]) # may all 0
+                    print(y_test.numpy()[i,j,:,:])
+    """
+    input_shape = x.shape
+    if len(input_shape) != 4:
+        raise ValueError("dimensions of x should be 4, but received {} != 4"\
+        .format(len(input_shape)))
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCHW' else [0, 3],
+        training=training,
+        mode="upscale_in_train",
+        name=name)
+
+
+def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
+    """
+    Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` ,
+    a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    Args:
+        x (Tensor):  The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C].
+                     The data type is float32 or float64.
+        p (float): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    ``NCDHW``, ``NDHWC``. The default is ``NCDHW`` . When it is ``NCDHW`` , the data is
+                                    stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout3d, has same shape and data type with `x` .
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout3d(x)  #train
+            y_test = paddle.nn.functional.dropout3d(x, training=False) #test
+            print(x.numpy()[0,0,:,:,:])
+            print(y_train.numpy()[0,0,:,:,:]) # may all 0
+            print(y_test.numpy()[0,0,:,:,:])
+    """
+
+    input_shape = x.shape
+    if len(input_shape) != 5:
+        raise ValueError("dimensions of x should be 5, but received {} != 5" \
+        .format(len(input_shape)))
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
+        training=training,
+        mode="upscale_in_train",
+        name=name)
+
+
+def alpha_dropout(x, p=0.5, training=True, name=None):
+    """
+    Alpha Dropout is a type of Dropout that maintains the self-normalizing property.
+    For an input with zero mean and unit standard deviation, the output of Alpha Dropout
+    maintains the original mean and standard deviation of the input.
+    Alpha Dropout fits well to SELU activate function by randomly setting activations to the negative saturation value.
+
+    Args:
+        x (Tensor): The input tensor. The data type is float32 or float64.
+        p (float | int): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout, has same shape and data type as `x`.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[-1, 1], [-1, 1]]).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.alpha_dropout(x, 0.5)
+            y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False)
+            print(x.numpy())
+            print(y_train.numpy())
+            # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly)
+            print(y_test.numpy())
+    """
+    if not isinstance(p, (float, int)):
+        raise TypeError("p argument should be a float or int")
+    if p < 0 or p > 1:
+        raise ValueError("p argument should between 0 and 1")
+
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'alpha_dropout')
+
+    if training:
+        #get transformation params
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
+        alpha_p = -alpha * scale
+        a = ((1 - p) * (1 + p * alpha_p**2))**-0.5
+        b = -a * alpha_p * p
+
+        dtype = x.dtype
+        input_shape = x.shape
+
+        #get mask
+        random_tensor = layers.uniform_random(
+            input_shape, dtype='float32', min=0., max=1.0)
+        p = layers.fill_constant(shape=[1], dtype='float32', value=p)
+        keep_mask = layers.greater_equal(random_tensor, p)
+        keep_mask = layers.cast(keep_mask, dtype)
+        drop_mask = layers.elementwise_sub(
+            layers.fill_constant(
+                shape=input_shape, dtype=dtype, value=1.),
+            keep_mask)
+
+        #apply mask
+        b = layers.fill_constant(shape=[1], dtype=dtype, value=b)
+        y = layers.elementwise_add(
+            paddle.multiply(x, keep_mask),
+            layers.scale(
+                drop_mask, scale=alpha_p))
+        res = layers.elementwise_add(layers.scale(y, scale=a), b, name=name)
+        return res
+    else:  # test
+        return x
+
+
+def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
+    """
+    Pad tensor according to 'pad' and 'mode'.
+    If mode is 'reflect', pad[0] and pad[1] must be no greater
+    than width-1. The height and depth dimension has the same condition.
+
+    Parameters:
+        x (Tensor): The input tensor with data type float32/double/int32/int64_t.
+        pad (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. 1. If input dimension is 3, then the pad has the form (pad_left,
+            pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, 
+            pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form 
+            (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+            
+        mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'.
+            When in 'constant' mode, this op uses a constant value to pad the input tensor.
+            When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor.
+            When in 'replicate' mode, uses input boundaries to pad the input tensor.
+            When in 'circular' mode, uses circular input to pad the input tensor.
+            Default is 'constant'
+        value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0
+        data_format (str): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of
+           the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+                    
+    Returns: a Tensor padded according to pad and mode and data type is same as input.
+    Return Type: Tensor
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+
+            Case 0:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'constant'
+                value = 0
+                Out = [[[[[0. 0. 0. 0. 0. 0. 0.]
+                          [0. 0. 1. 2. 3. 0. 0.]
+                          [0. 0. 4. 5. 6. 0. 0.]
+                          [0. 0. 0. 0. 0. 0. 0.]]]]]
+
+            Case 1:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'reflect'
+                Out = [[[[[6. 5. 4. 5. 6. 5. 4.]
+                          [3. 2. 1. 2. 3. 2. 1.]
+                          [6. 5. 4. 5. 6. 5. 4.]
+                          [3. 2. 1. 2. 3. 2. 1.]]]]]
+
+            Case 2:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'replicate'
+                Out = [[[[[1. 1. 1. 2. 3. 3. 3.]
+                          [1. 1. 1. 2. 3. 3. 3.]
+                          [4. 4. 4. 5. 6. 6. 6.]
+                          [4. 4. 4. 5. 6. 6. 6.]]]]]
+
+            Case 3:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'circular'
+                Out = [[[[[5. 6. 4. 5. 6. 4. 5.]
+                          [2. 3. 1. 2. 3. 1. 2.]
+                          [5. 6. 4. 5. 6. 4. 5.]
+                          [2. 3. 1. 2. 3. 1. 2.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            
+            paddle.disable_static()
+            
+            # example 1
+            x_shape = (1, 1, 3)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad(tensor_x, pad=[2, 3], value=1, mode='constant')
+            print(y.numpy())
+            # [[[1. 1. 1. 2. 3. 1. 1. 1.]]]
+            
+            # example 2
+            x_shape = (1, 1, 2, 3)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad(tensor_x, pad=[1, 2, 1, 1], value=1, mode='circular')
+            print(y.numpy())
+            # [[[[6. 4. 5. 6. 4. 5.]
+            #    [3. 1. 2. 3. 1. 2.]
+            #    [6. 4. 5. 6. 4. 5.]
+            #    [3. 1. 2. 3. 1. 2.]]]]
+    """
+    assert mode in ['reflect', 'replicate', 'constant', 'circular'], \
+            "mode should be one of constant, reflect, replicate, circular, but got {}.".format(mode)
+
+    data_format = data_format.upper()
+    assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], \
+        "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \
+        "but got {}".format(data_format)
+
+    x_dim = len(x.shape)
+
+    assert x_dim in [
+        3, 4, 5
+    ], "input tesor dimension must be in [3, 4, 5] but got {}".format(x_dim)
+
+    supported_format_map = {
+        3: ["NCL", "NLC"],
+        4: ["NCHW", "NHWC"],
+        5: ["NCDHW", "NDHWC"],
+    }
+    assert data_format in supported_format_map[x_dim], \
+    "input tensor dimension is {}, it's data format should be in {} but got {}".format(
+        x_dim, supported_format_map[x_dim], data_format)
+
+    unsqueezed_dim = []
+
+    if isinstance(pad, Variable):
+        if data_format in ["NCL", "NCHW", "NCDHW"]:
+            data_format = "NCDHW"
+            if x_dim == 3:
+                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                unsqueezed_dim = [3, 4]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                unsqueezed_dim = [2]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+        elif data_format in ["NLC", "NHWC", "NDHWC"]:
+            data_format = "NDHWC"
+            if x_dim == 3:
+                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                unsqueezed_dim = [2, 3]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                unsqueezed_dim = [1]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+    else:
+        if data_format in ["NCL", "NCHW", "NCDHW"]:
+            data_format = "NCDHW"
+            if x_dim == 3:
+                pad = [0, 0, 0, 0] + pad
+                unsqueezed_dim = [3, 4]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = pad + [0, 0]
+                unsqueezed_dim = [2]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+        elif data_format in ["NLC", "NHWC", "NDHWC"]:
+            data_format = "NDHWC"
+            if x_dim == 3:
+                pad = [0, 0, 0, 0] + pad
+                unsqueezed_dim = [2, 3]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = pad + [0, 0]
+                unsqueezed_dim = [1]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+
+    if in_dygraph_mode():
+        if isinstance(pad, Variable):
+            pad = pad.numpy()
+        out = core.ops.pad3d(x, "paddings", pad, "mode", mode, "value", value,
+                             "data_format", data_format, "name", name)
+    else:
+        attrs = {'mode': mode, 'value': value, 'data_format': data_format}
+        inputs = {'X': [x]}
+        if isinstance(pad, Variable):
+            inputs['Paddings'] = [pad]
+            attrs['paddings'] = []
+        else:
+            attrs['paddings'] = pad
+
+        helper = LayerHelper('pad3d', **locals())
+
+        dtype = helper.input_dtype(input_param_name='input')
+        out = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
+
+    if len(unsqueezed_dim) != 0:
+        out = squeeze(out, axes=unsqueezed_dim)
+
+    return out
+
+
+def cosine_similarity(x1, x2, axis=1, eps=1e-8):
+    """
+    Compute cosine similarity between x1 and x2 along axis.
+
+    Parameters:
+        x1 (Tensor): First input. float32/double.
+        x2 (Tensor): Second input. float32/double.
+        axis (int): Dimension of vectors to compute cosine similarity. Default is 1.
+        eps(float): Small value to avoid division by zero. Default is 1e-8.
+                    
+    Returns: a Tensor representing cosine similarity between x1 and x2 along axis.
+    Return Type: Tensor
+
+    Examples:
+        .. code-block:: text
+            Case 0:
+                x1 = [[0.8024077  0.9927354  0.27238318 0.8344984 ]
+                     [0.48949873 0.5797396  0.65444374 0.66510963]
+                     [0.1031398  0.9614342  0.08365563 0.6796464 ]
+                     [0.10760343 0.7461209  0.7726148  0.5801006 ]]
+                x2 = [[0.62913156 0.1536727  0.9847992  0.04591406]
+                     [0.9098952  0.15715368 0.8671125  0.3156102 ]
+                     [0.4427798  0.54136837 0.5276275  0.32394758]
+                     [0.3769419  0.8535014  0.48041078 0.9256797 ]]
+                axis = 1
+                eps = 1e-8
+                Out: [0.5275037  0.8368967  0.75037485 0.9245899]
+
+    Code Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            np.random.seed(0)
+            x1 = np.random.rand(2,3)
+            x2 = np.random.rand(2,3)
+            x1 = paddle.to_tensor(x1)
+            x2 = paddle.to_tensor(x2)
+            result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0)
+            print(result.numpy())
+            # [0.99806249 0.9817672  0.94987036]
+            
+    """
+    w12 = sum(elementwise_mul(x1, x2), axis=axis)
+    w1 = sum(elementwise_mul(x1, x1), axis=axis)
+    w2 = sum(elementwise_mul(x2, x2), axis=axis)
+    n12 = sqrt(clip(w1 * w2, min=eps * eps))
+    cos_sim = w12 / n12
+    return cos_sim
+
+
+def linear(x, weight, bias=None, name=None):
+    """
+
+    Fully-connected linear transformation op
+
+    .. math::
+
+        Out = {XW + b}
+
+    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+
+    The linear op multiplies input tensor with weight matrix and
+    produces an output Tensor of shape [N, *, output_dim], 
+    where N is batch size and `*` means any number of additional dimensions and output_dim is the last dim of ``weight``.
+    If ``bias`` is not None, a bias will be added to the output.
+
+    Args:
+        x(Tensor): Input tensor, its data type is float16, float32 or float64
+        weight(Tensor): Weight tensor, its data type is float16, float32 or float64
+        bias(Tensor|None, optional): Bias tensor, its data type is float16, float32 or float64. If it is set to None, no bias will be added to the output units.
+        name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Returns:
+        Output tensor
+
+    Examples:
+        .. code-block:: python
+          
+          import numpy as np
+          import paddle
+          import paddle.nn.functional as F
+          
+          input = np.ones((3,1,2), dtype=np.float32)
+          weight = np.ones((2,2), dtype=np.float32)
+          bias = np.ones((2), dtype=np.float32)
+          place = paddle.CPUPlace()
+          paddle.disable_static(place)
+          input = paddle.to_tensor(input)
+          weight = paddle.to_tensor(weight)
+          bias = paddle.to_tensor(bias)
+          out = F.linear(input, weight, bias)
+          print(out) #[3 3 3 3 3 3]
+    
+    """
+    if in_dygraph_mode():
+        pre_bias = _varbase_creator(dtype=x.dtype)
+        core.ops.matmul(x, weight, pre_bias, 'transpose_X', False,
+                        'transpose_Y', False, "alpha", 1)
+        return dygraph_utils._append_bias_in_dygraph(
+            pre_bias, bias, axis=len(x.shape) - 1)
+    else:
+        helper = LayerHelper('linear', **locals())
+        dtype = x.dtype
+
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'linear')
+        check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear')
+
+        inputs = {'X': [x], 'Y': [weight]}
+        attrs = {
+            'transpose_X': False,
+            'transpose_Y': False,
+            'alpha': 1,
+        }
+        tmp = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='matmul', inputs=inputs, outputs={'Out': tmp}, attrs=attrs)
+        if bias is not None:
+            res = helper.create_variable_for_type_inference(dtype)
+            helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [tmp],
+                        'Y': [bias]},
+                outputs={'Out': [res]},
+                attrs={'axis': len(x.shape) - 1})
+        else:
+            res = tmp
+        return res
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 2a519718258856fe1f4462422a36dccae7066ad1..f80f200c7163836252faa4b1c932178f6bab0dff 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -13,15 +13,24 @@
 # limitations under the License.
 from __future__ import print_function
 
-__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose']
+__all__ = [
+    'conv1d',
+    'conv_transpose1d',
+    'conv2d',
+    'conv_transpose2d',
+    'conv3d',
+    'conv_transpose3d',
+]
 
 import numpy as np
+from ...device import get_cudnn_version
 from ...fluid.framework import Variable, in_dygraph_mode
 from ...fluid import core, dygraph_utils
 from ...fluid.layers import nn, utils
 from ...fluid.data_feeder import check_variable_and_dtype
 from ...fluid.param_attr import ParamAttr
 from ...fluid.layer_helper import LayerHelper
+from .common import pad2d
 
 
 def _is_list_or_tuple(input):
@@ -87,20 +96,242 @@ def _update_padding_nd(padding, channel_last, num_dims):
     return padding, padding_algorithm
 
 
-def conv2d(input,
+def conv1d(x,
            weight,
            bias=None,
+           stride=1,
            padding=0,
+           dilation=1,
+           groups=1,
+           data_format='NCL',
+           name=None):
+    """
+    The convolution1D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCL format, where N is batch size, C is the number of
+    channels, L is the length of the feature.
+    Filter is in MCK format, where M is the number of output image channels,
+    C is the number of input image channels, K is the size of the kernel.
+    If the groups is greater than 1, C will equal the number of input image
+    channels divided by the groups. If bias attribution and activation type
+    are provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCL format.
+    * :math:`W`: Kernel value, a tensor with MCK format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+            L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1
+
+    Args:
+        x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type 
+            of input is float16 or float32 or float64.
+        weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is
+            the number of output channels, g is the number of groups, K is the kernel's size. 
+        bias (Tensor, optional): The bias with shape [M,]. Default: None.
+        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
+            contain one integers, (stride_size). Default: 1.
+        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
+            1. a string in ['valid', 'same'].
+            2. an int, which means the feature map is zero paded by size of `padding` on both sides.
+            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
+            4. a list[int] or tuple[int] whose length is 2. It has the form  [pad_before, pad_after].
+            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
+            The default value is 0.
+        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain one integer, (dilation_size). Default: 1.
+        groups (int, optional): The groups number of the conv1d function. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: 1.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+            will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
+            The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, feature_length]`.
+        name(str, optional): For detailed information, please refer 
+           to :ref:`api_guide_Name`. Usually name is no need to set and 
+           None by default.
+
+    Returns:
+        A tensor representing the conv1d, whose data type is the 
+        same with input.
+
+    Raises:
+        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If `data_format` is not "NCL" or "NLC".
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+            or the element corresponding to the input's channel is not 0.
+        ShapeError: If the input is not 3-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 1.
+        ShapeError: If the number of input channels is not equal to filter's channels * groups.
+        ShapeError: If the number of output channels is not be divided by groups.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          x = np.array([[[4, 8, 1, 9],
+            [7, 2, 0, 9],
+            [6, 9, 2, 6]]]).astype(np.float32)
+          w=np.array(
+          [[[9, 3, 4],
+            [0, 0, 7],
+            [2, 5, 6]],
+           [[0, 3, 4],
+            [2, 9, 7],
+            [5, 6, 8]]]).astype(np.float32)
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print(y_np)
+          
+          # [[[133. 238.]
+          #   [160. 211.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ["NCL", "NLC"]:
+        raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. "
+                         "Received Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NHWC")
+    channel_dim = -1 if channel_last else 1
+    conv2d_data_format = "NHWC" if channel_last else "NCHW"
+    num_channels = x.shape[channel_dim]
+    num_filters = weight.shape[0]
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             x.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, x.shape, groups))
+    if num_filters % groups != 0:
+        raise ValueError(
+            "the number of filters must be divisible by groups,"
+            "received: the number of filters is {}, the shape of weight is {}"
+            ", the groups is {}".format(num_filters, weight.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+    if len(padding) == 2:
+        padding = padding + [0] * 2
+    elif len(padding) == 1:
+        padding = padding + [0]
+    else:
+        raise ValueError(
+            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            format(padding))
+
+    stride = utils.convert_to_list(stride, 1, 'stride') + [1]
+    dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
+
+    l_type = "conv2d"
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
+        l_type = 'depthwise_conv2d'
+        use_cudnn = False
+
+    inputs = {'Input': [x], 'Filter': [weight]}
+    attrs = {
+        'strides': stride,
+        'paddings': padding,
+        'dilations': dilation,
+        'groups': groups,
+        'use_cudnn': use_cudnn,
+        'use_mkldnn': False,
+        'fuse_relu_before_depthwise_conv': False,
+        "padding_algorithm": padding_algorithm,
+        "data_format": conv2d_data_format
+    }
+    squeeze_aixs = -2 if channel_last else -1
+    x = nn.unsqueeze(input=x, axes=[squeeze_aixs])
+    weight = nn.unsqueeze(input=weight, axes=[-1])
+    if in_dygraph_mode():
+        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
+                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
+                 'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
+                 padding_algorithm, "data_format", conv2d_data_format)
+        out = getattr(core.ops, l_type)(x, weight, *attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    else:
+        inputs = {'Input': [x], 'Filter': [weight]}
+        attrs = {
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': False,
+            'fuse_relu_before_depthwise_conv': False,
+            "padding_algorithm": padding_algorithm,
+            "data_format": conv2d_data_format
+        }
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'conv2d')
+        helper = LayerHelper(l_type, **locals())
+        dtype = helper.input_dtype()
+        out = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [out]}
+        helper.append_op(
+            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    out = nn.squeeze(input=out, axes=[squeeze_aixs])
+    return out
+
+
+def conv2d(x,
+           weight,
+           bias=None,
            stride=1,
+           padding=0,
            dilation=1,
            groups=1,
-           use_cudnn=True,
-           act=None,
            data_format="NCHW",
            name=None):
     """
-	:alias_main: paddle.nn.functional.conv2d
-	:alias: paddle.nn.functional.conv2d,paddle.nn.functional.conv.conv2d
 
     The convolution2D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input and
@@ -152,12 +383,15 @@ def conv2d(input,
             W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
-        input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type 
+        x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type 
             of input is float16 or float32 or float64.
-        weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is
+        weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is
             the number of output channels, g is the number of groups, kH is the filter's
             height, kW is the filter's width. 
-        bias (Variable, optional): The bias with shape [M,].
+        bias (Tensor, optional): The bias with shape [M,].
+        stride (int|tuple): The stride size. It means the stride in convolution. 
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
             on both sides for each dimension.If `padding` is a string, either 'VALID' or
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
@@ -168,9 +402,6 @@ def conv2d(input,
             when `data_format` is `"NHWC"`, `pool_padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride (int|tuple): The stride size. It means the stride in convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
-            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel
             points. If dilation is a tuple, it must contain two integers, (dilation_height, 
             dilation_width). Otherwise, dilation_height = dilation_width = dilation. 
@@ -180,10 +411,6 @@ def conv2d(input,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: groups=1.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -193,13 +420,9 @@ def conv2d(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv2d, whose data type is the 
-        same with input. If act is None, the tensor variable storing the convolution 
-        result, and if act is not None, the tensor variable storing convolution 
-        and non-linearity activation result.
+        A Tensor representing the conv2d result, whose data type is the same with input. 
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCHW" or "NHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
@@ -214,62 +437,65 @@ def conv2d(input,
     Examples:
         .. code-block:: python
 
-          from paddle import fluid
+          import paddle
           import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
           import numpy as np
 
           x = np.random.randn(2, 3, 8, 8).astype(np.float32)
           w = np.random.randn(6, 3, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv2d(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv2d(x_var, w_var)
+          y_np = y_var.numpy()
+
           print(y_np.shape)
 
           # (2, 6, 6, 6)
     """
     # entry checks
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
                          "Received Attr(data_format): {}.".format(data_format))
 
     channel_last = (data_format == "NHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError("The channel dimmention of the input({}) "
                          "should be defined. Received: {}.".format(
-                             input.shape, num_channels))
+                             x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, input.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups))
     if num_filters % groups != 0:
         raise ValueError(
             "the number of filters must be divisible by groups,"
             "received: the number of filters is {}, the shape of weight is {}"
             ", the groups is {}".format(num_filters, weight.shape, groups))
 
+    # use_cudnn = True if core.is_compiled_with_cuda() else False
+    cudnn_version = get_cudnn_version()
+
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
+
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
     stride = utils.convert_to_list(stride, 2, 'stride')
     dilation = utils.convert_to_list(dilation, 2, 'dilation')
 
     l_type = "conv2d"
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
+    if (num_channels == groups and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
+        use_cudnn = False
 
-    inputs = {'Input': [input], 'Filter': [weight]}
+    inputs = {'Input': [x], 'Filter': [weight]}
     attrs = {
         'strides': stride,
         'paddings': padding,
@@ -287,15 +513,13 @@ def conv2d(input,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
                  'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
                  padding_algorithm, "data_format", data_format)
-        pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
+        pre_bias = getattr(core.ops, l_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
             'strides': stride,
             'paddings': padding,
@@ -307,8 +531,8 @@ def conv2d(input,
             "padding_algorithm": padding_algorithm,
             "data_format": data_format
         }
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv2d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv2d')
         helper = LayerHelper(l_type, **locals())
         dtype = helper.input_dtype()
         pre_bias = helper.create_variable_for_type_inference(dtype)
@@ -316,28 +540,279 @@ def conv2d(input,
         helper.append_op(
             type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
+
     return out
 
 
-def conv2d_transpose(input,
+def conv_transpose1d(x,
                      weight,
                      bias=None,
-                     output_size=None,
-                     padding=0,
                      stride=1,
+                     padding=0,
+                     output_padding=0,
+                     groups=1,
                      dilation=1,
+                     output_size=None,
+                     data_format="NCL",
+                     name=None):
+    """
+    The 1-D convolution transpose layer calculates the output based on the input,
+    filter, and dilation, stride, padding. Input(Input) and output(Output)
+    are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels,
+    L is the length of the feature. The details of convolution transpose
+    layer, please refer to the following explanation and references
+    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format.
+    * :math:`W`: Filter value, a 3-D Tensor with 'MCK' format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' or 'NLC', the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+           L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 + output_padding \\\\
+           L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ]
+
+    Note:
+          The conv1d_transpose can be seen as the backward of the conv1d. For conv1d,
+          when stride > 1, conv1d maps multiple input shape to the same output shape,
+          so for conv1d_transpose, when stride > 1, input shape maps multiple output shape.
+          If output_size is None, :math:`L_{out} = L^\prime_{out}`;
+          else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}`
+          and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically.
+
+    Args:
+        x(Tensor): 3-D tensor with [N, C, L] or [N, L, C] format,
+                         its data type is float32 or float64.
+        weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, K],
+            where M is the number of output channels(filters), g is the number of groups,
+            K is the size of the kernel.
+        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
+        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain one integer, `(stride_size)`.
+            Default: stride = 1.
+        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
+             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
+             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
+             If `padding` is a tuple or list, it could be in two forms:
+             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
+        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
+             If it is a tuple, it must contain one integer. Default: 0.
+        groups(int, optional): The groups number of the conv1d transpose function. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain one integer, `(dilation_size)`.
+            Default: dilation = 1.
+        output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain one integer, `(feature_length)`. None if use
+            filter_size, padding, and stride to calculate output_size.
+            If output_size and filter_size are specified at the same time, They
+            should follow the formula above. Default: None. output_size and filter_size
+            should not be None at the same time.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+            will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
+            The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_length]`.
+        name(str, optional): For detailed information, please refer 
+           to :ref:`api_guide_Name`. Usually name is no need to set and 
+           None by default.
+
+    Returns:
+        A  tensor representing the result of 1-D transpose convolution, whose
+        data type is the same with input. And its shape is (num_batches, channels, length)
+        when data_format is `"NCL"` and (num_batches, length, channels) when data_format is
+        `"NLC"`.
+
+    Raises:
+        ValueError: If `data_format` is a string, but not "NCL" or "NLC".
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+            or the element corresponding to the input's channel is not 0.
+        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_padding` is greater than `stride`.
+        ShapeError: If the input is not 3-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 1.
+        ShapeError: If the number of input channels is not equal to filter's channels.
+        ShapeError: If the size of `output_size` is not equal to that of `stride`.
+
+    Examples:
+        .. code-block:: python
+
+
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          
+          paddle.disable_static()
+          # shape: (1, 2, 4)
+          x=np.array([[[4, 0, 9, 7],
+                       [8, 0, 9, 2,]]]).astype(np.float32)
+          # shape: (2, 1, 2)
+          y=np.array([[[7, 0]],
+                      [[4, 2]]]).astype(np.float32)
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv_transpose1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print y_np
+          
+          # [[[60. 16. 99. 75.  4.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ['NCL', 'NLC']:
+        raise ValueError(
+            "Attr(data_format) of conv2d_transpose got wrong value: "
+            "received {}, but only 'NCL' or 'NLC' are supported.".format(
+                data_format))
+    channel_last = (data_format == "NLC")
+    channel_dim = -1 if channel_last else 1
+
+    num_channels = x.shape[channel_dim]
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             x.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, x.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+
+    if len(padding) == 2:
+        padding = padding + [0] * 2
+    elif len(padding) == 1:
+        padding = padding + [0]
+    else:
+        raise ValueError(
+            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            format(padding))
+
+    stride = utils.convert_to_list(stride, 1, 'stride') + [1]
+    dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
+    output_padding = utils.convert_to_list(output_padding, 1,
+                                           'output_padding') + [0]
+    if output_padding[0] > stride[0]:
+        raise ValueError(
+            "The size of output_padding should not be greater than stride."
+            "But got output_padding={} and stride={}".format(output_padding[0],
+                                                             stride[0]))
+
+    if output_size is None:
+        output_size = []
+    elif isinstance(output_size, (list, tuple, int)):
+        output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1]
+    else:
+        raise ValueError("output_size should be int, or list, tuple of ints")
+
+    op_type = 'conv2d_transpose'
+    num_filters = weight.shape[1]
+    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+        op_type = 'depthwise_conv2d_transpose'
+        use_cudnn = False
+
+    squeeze_axis = -2 if channel_last else -1
+    conv2d_data_format = "NHWC" if channel_last else "NCHW"
+
+    x = nn.unsqueeze(input=x, axes=[squeeze_axis])
+    weight = nn.unsqueeze(input=weight, axes=[-1])
+
+    if in_dygraph_mode():
+        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
+                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
+                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
+                 'data_format', conv2d_data_format)
+        out = getattr(core.ops, op_type)(x, weight, *attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    else:
+        inputs = {'Input': [x], 'Filter': [weight]}
+        attrs = {
+            'output_size': output_size,
+            'strides': stride,
+            'paddings': padding,
+            'padding_algorithm': padding_algorithm,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'data_format': conv2d_data_format
+        }
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'conv2d_transpose')
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype()
+        out = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [out]}
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+
+    if output_size is None:
+        out = pad2d(
+            out,
+            padding=[0, output_padding, 0, 0],
+            data_format=conv2d_data_format,
+            name=name)
+    out = nn.squeeze(input=out, axes=[squeeze_axis])
+    return out
+
+
+def conv_transpose2d(x,
+                     weight,
+                     bias=None,
+                     stride=1,
+                     padding=0,
+                     output_padding=0,
                      groups=1,
-                     use_cudnn=True,
-                     act=None,
+                     dilation=1,
                      data_format='NCHW',
+                     output_size=None,
                      name=None):
     """
-	:alias_main: paddle.nn.functional.conv2d_transpose
-	:alias: paddle.nn.functional.conv2d_transpose,paddle.nn.functional.conv.conv2d_transpose
 
     The convolution2D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
@@ -350,6 +825,7 @@ def conv2d_transpose(input,
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
+    See more detail in :ref:`api_nn_conv_ConvTranspose2d` .
 
     For each input :math:`X`, the equation is:
 
@@ -398,18 +874,15 @@ def conv2d_transpose(input,
           conv2d_transpose can compute the kernel size automatically.
 
     Args:
-        input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
+        x(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
             whose data type is float32 or float64.
-        weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
+        weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
             where M is the number of output channels(filters), g is the number of groups,
             kH is the height of the kernel, and kW is the width of the kernel.
-        bias(Variable, optional): The bias, a Tensor with shape [M, ].
-        output_size(int|tuple|list, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_height, image_width). None if use
-            filter_size, padding, and stride to calculate output_size.
-            If output_size is specified, output_size and filter_size (weight)'s shape 
-            should follow the formula above. Default: None. output_size and filter_size 
-            should not be None at the same time.
+        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
              `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
              string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
@@ -421,10 +894,9 @@ def conv2d_transpose(input,
             when `data_format` is `'NHWC'`, `padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
-            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
-        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). 
             Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
         groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
@@ -433,10 +905,12 @@ def conv2d_transpose(input,
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups = 1.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
+        output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain two integers, (image_height, image_width). None if use
+            filter_size, padding, and stride to calculate output_size.
+            If output_size is specified, output_size and filter_size (weight)'s shape 
+            should follow the formula above. Default: None. output_size and filter_size 
+            should not be None at the same time.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -446,20 +920,17 @@ def conv2d_transpose(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv2d_transpose, whose 
+        A Tensor representing the conv_transpose2d, whose 
         data type is the same with input and shape is (num_batches, channels, out_h, 
-        out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable 
-        storing the transposed convolution result, and if act is not None, the 
-        tensor variable storing transposed convolution and non-linearity activation 
-        result.
+        out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing 
+        transposed convolution result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCHW" or "NHWC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
-        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_size` and kernel_size are None at the same time.
         ShapeError: If the input is not 4-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
         ShapeError: If the dimension size of input minus the size of `stride` is not 2.
@@ -469,28 +940,23 @@ def conv2d_transpose(input,
     Examples:
         .. code-block:: python
 
-          from paddle import fluid
-          import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
           import numpy as np
+          import paddle
+          import paddle.nn.functional as F
 
           x = np.random.randn(2, 3, 8, 8).astype(np.float32)
           w = np.random.randn(3, 6, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv2d_transpose(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv2d_transpose(x_var, w_var)
+          y_np = y_var.numpy()
           print(y_np.shape)
 
           # (2, 6, 10, 10)
     """
 
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
@@ -498,48 +964,65 @@ def conv2d_transpose(input,
                 data_format))
     channel_last = (data_format == "NHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError("The channel dimmention of the input({}) "
                          "should be defined. Received: {}.".format(
-                             input.shape, num_channels))
+                             x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, input.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups))
+
+    cudnn_version = get_cudnn_version()
+
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
     stride = utils.convert_to_list(stride, 2, 'stride')
     dilation = utils.convert_to_list(dilation, 2, 'dilation')
+
     if output_size is None:
         output_size = []
-    elif isinstance(output_size, (list, tuple, int)):
-        output_size = utils.convert_to_list(output_size, 2, 'output_size')
     else:
-        raise ValueError("output_size should be int, or list, tuple of ints")
+        if output_padding != 0:
+            raise ValueError('output_padding option is mutually exclusive with '
+                             'output_size')
+        if isinstance(output_size, (list, tuple, int)):
+            output_size = utils.convert_to_list(output_size, 2, 'output_size')
+        else:
+            raise ValueError(
+                "output_size should be int, or list, tuple of ints")
+
+    if output_padding == 0:
+        output_padding = []
+    else:
+        output_padding = utils.convert_to_list(output_padding, 2,
+                                               'output_padding')
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+    if (num_channels == groups and num_filters == 1):
         op_type = 'depthwise_conv2d_transpose'
+        use_cudnn = False
 
     if in_dygraph_mode():
-        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
-                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
-                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
-                 'data_format', data_format)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        attrs = ('output_padding', output_padding, 'output_size', output_size,
+                 'strides', stride, 'paddings', padding, 'padding_algorithm',
+                 padding_algorithm, 'dilations', dilation, 'groups', groups,
+                 'use_cudnn', use_cudnn, 'data_format', data_format)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
+            'output_padding': output_padding,
             'output_size': output_size,
             'strides': stride,
             'paddings': padding,
@@ -549,37 +1032,32 @@ def conv2d_transpose(input,
             'use_cudnn': use_cudnn,
             'data_format': data_format
         }
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'],
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                                  'conv2d_transpose')
         helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype()
-        pre_bias = helper.create_variable_for_type_inference(dtype)
+        pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
+
     return out
 
 
-def conv3d(input,
+def conv3d(x,
            weight,
            bias=None,
-           padding=0,
            stride=1,
+           padding=0,
            dilation=1,
            groups=1,
-           use_cudnn=True,
-           act=None,
            data_format="NCDHW",
            name=None):
     """
-	:alias_main: paddle.nn.functional.conv3d
-	:alias: paddle.nn.functional.conv3d,paddle.nn.functional.conv.conv3d
 
     The convolution3D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
@@ -625,12 +1103,15 @@ def conv3d(input,
             W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
 
     Args:
-        input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data 
+        x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data 
             type of input is float16 or float32 or float64.
         weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW],
             where M is the number of filters(output channels), g is the number of groups,
             kD, kH, kW are the filter's depth, height and width respectively.
-        bias (Variable, optional): The bias, a Tensor of shape [M, ].
+        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
+        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
+            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
+            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings 
             on both sides for each dimension. If `padding` is a string, either 'VALID' or
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
@@ -641,9 +1122,6 @@ def conv3d(input,
             when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
-            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
-            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
             dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
@@ -653,10 +1131,6 @@ def conv3d(input,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: groups=1
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -666,13 +1140,12 @@ def conv3d(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv3d, whose data type is 
+        A Tensor representing the conv3d, whose data type is 
         the same with input. If act is None, the tensor variable storing the 
         convolution result, and if act is not None, the tensor variable storing 
         convolution and non-linearity activation result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
@@ -706,10 +1179,6 @@ def conv3d(input,
             # (2, 6, 6, 6, 6)
     """
     # entry check
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. Received "
-                         "Attr(use_cudnn): {}. ".format(use_cudnn))
-
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
@@ -717,12 +1186,12 @@ def conv3d(input,
 
     channel_last = (data_format == "NDHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
             "The channel dimmention of the input({}) should be defined. "
-            "Received: {}.".format(input.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
@@ -734,6 +1203,10 @@ def conv3d(input,
             "Received: number of filters({}), groups({}).".format(num_filters,
                                                                   groups))
 
+    cudnn_version = get_cudnn_version()
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
+
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = utils.convert_to_list(stride, 3, 'stride')
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
@@ -744,15 +1217,13 @@ def conv3d(input,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
                  "padding_algorithm", padding_algorithm, "data_format",
                  data_format)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
             'strides': stride,
             'paddings': padding,
@@ -765,8 +1236,8 @@ def conv3d(input,
         }
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype()
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv3d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv3d')
 
         pre_bias = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [pre_bias]}
@@ -774,31 +1245,26 @@ def conv3d(input,
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
 
     return out
 
 
-def conv3d_transpose(input,
+def conv_transpose3d(x,
                      weight,
                      bias=None,
-                     output_size=None,
-                     padding=0,
                      stride=1,
-                     dilation=1,
+                     padding=0,
+                     output_padding=0,
                      groups=1,
-                     use_cudnn=True,
-                     act=None,
+                     dilation=1,
                      data_format='NCDHW',
+                     output_size=None,
                      name=None):
     """
-	:alias_main: paddle.nn.functional.conv3d_transpose
-	:alias: paddle.nn.functional.conv3d_transpose,paddle.nn.functional.conv.conv3d_transpose
-
-    The convolution3D transpose layer calculates the output based on the input,
+    The convolution3d transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
     are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels,
     D is the depth of the feature, H is the height of the feature, and W
@@ -809,6 +1275,7 @@ def conv3d_transpose(input,
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
+    See more detail in :ref:`api_nn_conv_ConvTranspose3d` .
 
     For each input :math:`X`, the equation is:
 
@@ -861,17 +1328,16 @@ def conv3d_transpose(input,
           conv3d_transpose can compute the kernel size automatically.
 
     Args:
-        input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type 
+        x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type 
             of input is float32 or float64.
-        weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
+        weight (Tensor): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
             where M is the number of filters(output channels), g is the number of groups,
             kD, kH, kW are the filter's depth, height and width respectively.
-        bias (Variable, optional): The bias, a Tensor of shape [M, ].
-        output_size(int|tuple, optional): The output image size. If output size is a
-            tuple, it must contain three integers, (image_depth, image_height, image_width). This
-            parameter only works when filter_size is None. If output_size and filter_size are 
-            specified at the same time, They should follow the formula above. Default: None. 
-            Output_size and filter_size should not be None at the same time.
+        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
+            Default: stride = 1.
         padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
              adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
              either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
@@ -882,11 +1348,9 @@ def conv3d_transpose(input,
             when `data_format` is `'NDHWC'`, `padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
-            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
-            Default: stride = 1.
-        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, 
             dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
             Default: dilation = 1.
@@ -896,32 +1360,32 @@ def conv3d_transpose(input,
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups=1
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
             `[batch_size, input_channels, input_height, input_width]`.
+        output_size(int|list|tuple, optional): The output image size. If output size is a
+            tuple, it must contain three integers, (image_depth, image_height, image_width). This
+            parameter only works when filter_size is None. If output_size and filter_size are 
+            specified at the same time, They should follow the formula above. Default: None. 
+            Output_size and filter_size should not be None at the same time.
         name(str, optional): For detailed information, please refer 
            to :ref:`api_guide_Name`. Usually name is no need to set and 
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv3d_transpose, whose data 
+        A Tensor representing the conv_transpose3d, whose data 
         type is the same with input and shape is (num_batches, channels, out_d, out_h, 
         out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor 
         variable storing the transposed convolution result, and if act is not None, the tensor 
         variable storing transposed convolution and non-linearity activation result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
-        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_size` and kernel_size are None at the same time.
         ShapeError: If the input is not 5-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
         ShapeError: If the dimension size of input minus the size of `stride` is not 2.
@@ -930,29 +1394,26 @@ def conv3d_transpose(input,
 
     Examples:
        .. code-block:: python
+          
+          import numpy as np
 
-          from paddle import fluid
+          import paddle
           import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
-          import numpy as np
 
           x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
           w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv3d_transpose(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv_transpose3d(x_var, w_var)
+          y_np = y_var.numpy()
           print(y_np.shape)
 
           # (2, 6, 10, 10, 10)
     """
     # entry checks
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
@@ -960,12 +1421,12 @@ def conv3d_transpose(input,
 
     channel_last = (data_format == "NDHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[1]
     if num_channels < 0:
         raise ValueError(
             "The channel dimmention of the input({}) should be defined. "
-            "Received: {}.".format(input.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
@@ -977,29 +1438,45 @@ def conv3d_transpose(input,
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
     if output_size is None:
         output_size = []
-    elif isinstance(output_size, (list, tuple, int)):
-        output_size = utils.convert_to_list(output_size, 3, 'output_size')
     else:
-        raise ValueError("output_size should be int, or list, tuple of ints")
+        if output_padding != 0:
+            raise ValueError('output_padding option is mutually exclusive with '
+                             'output_size')
+        if isinstance(output_size, (list, tuple, int)):
+            output_size = utils.convert_to_list(output_size, 3, 'output_size')
+        else:
+            raise ValueError(
+                "output_size should be int, or list, tuple of ints")
+
+    if output_padding == 0:
+        output_padding = []
+    else:
+        output_padding = utils.convert_to_list(output_padding, 3,
+                                               'output_padding')
+
+    cudnn_version = get_cudnn_version()
+
+    #TODO(LielinJiang): whether to use cudnn according to the version of cudnn
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
 
     op_type = 'conv3d_transpose'
     data_format_ = "NHWC" if channel_last else "NCHW"
 
     if in_dygraph_mode():
-        attrs = ('output_size', output_size, 'paddings', padding,
-                 "padding_algorithm", padding_algorithm, 'strides', stride,
-                 'dilations', dilation, 'groups', groups, 'use_cudnn',
-                 use_cudnn, "data_format", data_format_)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        attrs = ('output_padding', output_padding, 'output_size', output_size,
+                 'paddings', padding, "padding_algorithm", padding_algorithm,
+                 'strides', stride, 'dilations', dilation, 'groups', groups,
+                 'use_cudnn', use_cudnn, "data_format", data_format_)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
+            'output_padding': output_padding,
             'output_size': output_size,
             'paddings': padding,
             "padding_algorithm": padding_algorithm,
@@ -1010,19 +1487,17 @@ def conv3d_transpose(input,
             "data_format": data_format_
         }
         helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype()
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv3d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv3d')
 
-        pre_bias = helper.create_variable_for_type_inference(dtype)
+        pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
 
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
 
     return out
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
new file mode 100644
index 0000000000000000000000000000000000000000..e77bf0e39672984f7076938b134f3e54f4c761ab
--- /dev/null
+++ b/python/paddle/nn/functional/input.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import warnings
+from ...fluid.framework import Variable, in_dygraph_mode
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.layers import core
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+
+__all__ = ['one_hot']
+
+
+def one_hot(x, num_classes, name=None):
+    """
+
+    The operator converts each id in the input 'x' to an one-hot vector with a
+    num_classes length. The value in the vector dimension corresponding to the id
+    is 1, and the value in the remaining dimension is 0.
+
+    The shape of output Tensor is generated by appending num_classes dimension
+    behind the last dimension of the 'x' shape.
+
+    .. code-block:: text
+
+        Example 1:
+
+        input:
+            x.shape = [4]
+            x.data = [1, 1, 3, 0]
+            num_classes = 4
+
+        output:
+            Out.shape = [4, 4]
+            Out.data = [[0., 1., 0., 0.],
+                        [0., 1., 0., 0.],
+                        [0., 0., 0., 1.],
+                        [1., 0., 0., 0.]]
+
+        Example 2:
+
+        input:
+            x.shape = [4]
+            x.data = [1, 1, 5, 0]
+            num_classes = 4
+
+        output: Throw an exception for Illegal value
+            The second dimension in X is 5, which is greater than num_classes,
+            so it throws an exception.
+
+
+    Args:
+        x(Tensor): Tensor with shape :math:`[N_1, N_2, ..., N_k]` ,
+            which contains at least one dimension. The data type is int32 or int64.
+        num_classes(int): An integer defining the num_classes of the one hot dimension. If input 'x'
+            is word id, num_classes is generally the dictionary size.
+
+    Returns:
+        Tensor: The one-hot representations of 'x'. A Tensor with type float32.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            # Correspond to the first example above, where label.shape is 4 and one_hot_label.shape is [4, 4].
+            label = paddle.data(name="label", shape=[4, 1], dtype="int64")
+            # label.shape = [4]
+            # label.data = [1, 1, 3, 0]
+            one_hot_label = paddle.nn.functional.one_hot(x=label, num_classes=4)
+            # one_hot_label.shape = [4, 4]
+            # one_hot_label.data = [[0., 1., 0., 0.],
+            #                       [0., 1., 0., 0.],
+            #                       [0., 0., 0., 1.],
+            #                       [1., 0., 0., 0.]]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.one_hot_v2(x, 'depth', num_classes,
+                                   'allow_out_of_range', False)
+    else:
+        check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'one_hot_v2')
+        helper = LayerHelper("one_hot_v2", **locals())
+
+        one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
+        if not isinstance(num_classes, Variable):
+            # user attribute 
+            inputs = {'X': x}
+            attrs = {'depth': num_classes, 'allow_out_of_range': False}
+        else:
+            num_classes.stop_gradient = True
+            inputs = {'X': x, 'depth_tensor': num_classes}
+            attrs = {'allow_out_of_range': False}
+        helper.append_op(
+            type="one_hot_v2",
+            inputs=inputs,
+            attrs=attrs,
+            outputs={'Out': one_hot_out},
+            stop_gradient=True)
+        return one_hot_out
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index bc6d26370f0254b9349ff1bb871a1840ba26293d..55bb36d136405385a88b991576c2a9091437d456 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -12,17 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define loss functions of neural network  
+import paddle
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+import paddle.fluid as fluid
+
+# TODO: define loss functions of neural network
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from ...fluid.framework import core, in_dygraph_mode
+from ...fluid.layers.nn import _elementwise_op_in_dygraph
 from ...fluid.layers import bpr_loss  #DEFINE_ALIAS
 from ...fluid.layers import center_loss  #DEFINE_ALIAS
-from ...fluid.layers import cross_entropy  #DEFINE_ALIAS
 from ...fluid.layers import dice_loss  #DEFINE_ALIAS
 from ...fluid.layers import iou_similarity  #DEFINE_ALIAS
-from ...fluid.layers import kldiv_loss  #DEFINE_ALIAS
 from ...fluid.layers import log_loss  #DEFINE_ALIAS
-from ...fluid.layers import mse_loss  #DEFINE_ALIAS
 from ...fluid.layers import npair_loss  #DEFINE_ALIAS
 from ...fluid.layers import rank_loss  #DEFINE_ALIAS
+from ...fluid.layers import reshape
 from ...fluid.layers import sigmoid_cross_entropy_with_logits  #DEFINE_ALIAS
 from ...fluid.layers import sigmoid_focal_loss  #DEFINE_ALIAS
 from ...fluid.layers import smooth_l1  #DEFINE_ALIAS
@@ -33,10 +41,15 @@ from ...fluid.layers import teacher_student_sigmoid_loss  #DEFINE_ALIAS
 
 from ...fluid.layers import edit_distance  #DEFINE_ALIAS
 from ...fluid.layers import huber_loss  #DEFINE_ALIAS
-from ...fluid.layers import margin_rank_loss  #DEFINE_ALIAS
 from ...fluid.layers import sampled_softmax_with_cross_entropy  #DEFINE_ALIAS
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.framework import in_dygraph_mode
+from ...fluid.framework import _varbase_creator
+from ...fluid.framework import Variable
 
 __all__ = [
+    'binary_cross_entropy',
+    'binary_cross_entropy_with_logits',
     'bpr_loss',
     'center_loss',
     'cross_entropy',
@@ -44,19 +57,1101 @@ __all__ = [
     'edit_distance',
     'huber_loss',
     'iou_similarity',
-    'kldiv_loss',
+    'kl_div',
+    'l1_loss',
     'log_loss',
-    'margin_rank_loss',
     'mse_loss',
+    'margin_ranking_loss',
     #       'nce',
+    'nll_loss',
     'npair_loss',
     'rank_loss',
     'sampled_softmax_with_cross_entropy',
     'sigmoid_cross_entropy_with_logits',
     'sigmoid_focal_loss',
     'smooth_l1',
+    'smooth_l1_loss',
     'softmax_with_cross_entropy',
     'square_error_cost',
     'ssd_loss',
-    'teacher_student_sigmoid_loss'
+    'teacher_student_sigmoid_loss',
+    'ctc_loss',
 ]
+
+
+def binary_cross_entropy(input, label, weight=None, reduction='mean',
+                         name=None):
+    """
+    This op measures the binary_cross_entropy loss between input predictions ``input``
+    and target labels ``label`` . The binary_cross_entropy loss can be described as:
+
+    If :attr:`weight` is set, the loss is:
+
+    .. math::
+        Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
+
+    If :attr:`weight` is None, the loss is:
+
+    .. math::
+        Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
+
+    If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`.
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(Out)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(Out)
+
+    Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label``
+    should be numbers between 0 and 1.
+
+    Parameters:
+        input (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``input``
+            should always be the output of sigmod.  Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``input``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, has to be a Tensor of size nbatch and the data type
+            is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+
+    Returns:
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``input`` , else the shape of output is scalar.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            input_data = np.array([0.5, 0.6, 0.7]).astype("float32")
+            label_data = np.array([1.0, 0.0, 1.0]).astype("float32")
+
+            paddle.disable_static()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = paddle.nn.functional.binary_cross_entropy(input, label)
+            print(output.numpy())  # [0.65537095]
+            paddle.enable_static()
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in binary_cross_entropy should be 'sum', "
+            "'mean' or 'none', but received %s, which is not allowed." %
+            reduction)
+
+    if in_dygraph_mode():
+        out = core.ops.bce_loss(input, label)
+        if weight is not None:
+            out = core.ops.elementwise_mul(out, weight, 'axis', -1)
+
+        if reduction == 'sum':
+            return core.ops.reduce_sum(out, 'dim', [0], 'keep_dim', False,
+                                       "reduce_all", True)
+        elif reduction == 'mean':
+            return core.ops.reduce_mean(out, 'dim', [0], 'keep_dim', False,
+                                        "reduce_all", True)
+        else:
+            return out
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'binary_cross_entropy')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'binary_cross_entropy')
+
+    sub_name = name if weight is None and reduction is 'none' else None
+    helper = LayerHelper("binary_cross_entropy", name=sub_name)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='bce_loss',
+        inputs={
+            'X': [input],
+            'Label': [label],
+        },
+        outputs={'Out': [out]})
+
+    if weight is not None:
+        if isinstance(weight, paddle.framework.Variable):
+            weight_name = name if reduction is 'none' else None
+            out = paddle.multiply(out, weight, axis=-1, name=weight_name)
+        else:
+            raise ValueError(
+                "The weight is not a Tensor, please convert to Tensor.")
+
+    if reduction == 'sum':
+        return paddle.sum(out, name=name)
+    elif reduction == 'mean':
+        return paddle.mean(out, name=name)
+    else:
+        return out
+
+
+def binary_cross_entropy_with_logits(logit,
+                                     label,
+                                     weight=None,
+                                     reduction='mean',
+                                     pos_weight=None,
+                                     name=None):
+    """
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    This can be thought of as predicting labels for a data-point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+
+    First this operator calculate loss function as follows:
+
+    .. math::
+           Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit))
+
+    We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get:
+
+    .. math::
+           Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit})
+
+    For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+
+    .. math::
+           Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|})
+
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
+    weight tensor on the loss `Out`. The ``weight`` tensor will attach different
+    weight on every items in the batch. The ``pos_weight`` will attach different
+    weight on the positive label of each class.
+
+    Finally, this operator applies reduce operation on the loss.
+    If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`.
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+
+    Args:
+        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            The data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        pos_weight (Tensor, optional): A weight of positive examples. Must be a vector
+            with length equal to the number of classes. The data type is float32, float64.
+            Default is ``'None'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``logit`` , else the shape of output is scalar.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label)
+            print(output.numpy())  # [0.45618808]
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in binary_cross_entropy_with_logits "
+            "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
+            % reduction)
+
+    if in_dygraph_mode():
+        one = _varbase_creator(dtype=logit.dtype)
+        core.ops.fill_constant(one, 'value',
+                               float(1.0), 'force_cpu', False, 'dtype',
+                               one.dtype, 'str_value', '1.0', 'shape', [1])
+        out = core.ops.sigmoid_cross_entropy_with_logits(logit, label)
+        if pos_weight is not None:
+            log_weight = core.ops.elementwise_add(
+                core.ops.elementwise_mul(
+                    label, core.ops.elementwise_sub(pos_weight, one)), one)
+            out = core.ops.elementwise_mul(out, log_weight)
+        if weight is not None:
+            out = core.ops.elementwise_mul(out, weight)
+
+        if reduction == "sum":
+            return core.ops.reduce_sum(out, 'reduce_all', True)
+        elif reduction == "mean":
+            return core.ops.mean(out)
+        else:
+            return out
+
+    fluid.data_feeder.check_variable_and_dtype(
+        logit, 'logit', ['float32', 'float64'],
+        'binary_cross_entropy_with_logits')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'],
+        'binary_cross_entropy_with_logits')
+    sigmoid_name = None
+    if reduction == 'none' and pos_weight is None and weight is None:
+        sigmoid_name = name
+
+    out = paddle.nn.functional.sigmoid_cross_entropy_with_logits(
+        logit, label, name=sigmoid_name)
+
+    one = paddle.fill_constant(shape=[1], value=1.0, dtype=logit.dtype)
+    if pos_weight is not None:
+        fluid.data_feeder.check_variable_and_dtype(
+            pos_weight, 'pos_weight', ['float32', 'float64'],
+            'binary_cross_entropy_with_logits')
+        log_weight = paddle.add(
+            paddle.multiply(label, paddle.elementwise_sub(pos_weight, one)),
+            one)
+        pos_weight_name = name if reduction == 'none' and weight is None else None
+        out = paddle.multiply(out, log_weight, name=pos_weight_name)
+
+    if weight is not None:
+        fluid.data_feeder.check_variable_and_dtype(
+            weight, 'weight', ['float32', 'float64'],
+            'binary_cross_entropy_with_logits')
+        weight_name = name if reduction == 'none' else None
+        out = paddle.multiply(out, weight, name=weight_name)
+
+    if reduction == "sum":
+        return paddle.sum(out, name=name)
+    elif reduction == "mean":
+        return paddle.mean(out, name=name)
+    return out
+
+
+def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
+    """
+    This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
+    term if the absolute element-wise error falls below 1 and an L1 term otherwise.
+    In some cases it can prevent exploding gradients and it is more robust and less
+    sensitivity to outliers. Also known as the Huber loss:
+
+    .. math::
+
+         loss(x,y)=\\frac{1}{n}\\sum_{i}z_i
+
+
+    where z_i is given by:
+
+    .. math::
+
+         \\mathop{z_i}=\\left\\{\\begin{array}{rcl}
+        0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\
+        delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise}
+        \\end{array} \\right.
+
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is float32 or float64. The shape of label
+            is the same as the shape of input.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        delta (float, optional): Specifies the hyperparameter delta to be used.
+            The value determines how large the errors need to be to use L1. Errors
+            smaller than delta are minimized with L2. Parameter is ignored for
+            negative/zero values. Default = 1.0
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        The tensor variable storing the smooth_l1_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.random.rand(3,3).astype("float32")
+            label_data = np.random.rand(3,3).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = paddle.nn.functioanl.smooth_l1_loss(input, label)
+            print(output.numpy())
+    """
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'smooth_l1_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'smooth_l1_loss')
+
+    out = huber_loss(input=input, label=label, delta=delta)
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in smooth_l1_loss should be 'sum', 'mean' or"
+            " 'none', but received %s, which is not allowed." % reduction)
+    if reduction == 'none':
+        return out
+    elif reduction == 'mean':
+        return fluid.layers.reduce_mean(out)
+    elif reduction == 'sum':
+        return fluid.layers.reduce_sum(out)
+
+
+def margin_ranking_loss(input,
+                        other,
+                        label,
+                        margin=0.0,
+                        reduction='mean',
+                        name=None):
+    """
+
+    This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows.
+
+    .. math::
+        margin\_rank\_loss = max(0, -label * (input - other) + margin)
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'none'``, just return the origin ``margin_rank_loss``.
+
+    Parameters:
+        input(Tensor): the first input tensor, it's data type should be float32, float64.
+        other(Tensor): the second input tensor, it's data type should be float32, float64.
+        label(Tensor): the label value corresponding to input, it's data type should be float32, float64.
+        margin (float, optional): The margin value to add, default value is 0;
+        reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns: Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype('float32'))
+            other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype('float32'))
+            label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype('float32'))
+            loss = paddle.nn.functional.margin_ranking_loss(input, other, label)
+            print(loss.numpy()) # [0.75]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
+            "received %s, which is not allowed." % reduction)
+    if fluid.framework.in_dygraph_mode():
+        out = core.ops.elementwise_sub(other, input)
+        out = core.ops.elementwise_mul(out, label)
+        if margin != 0.0:
+            margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype)
+            out = core.ops.elementwise_add(out, margin)
+        out = core.ops.relu(out)
+        if reduction == 'sum':
+            return core.ops.reduce_sum(out, 'reduce_all', True)
+        elif reduction == 'mean':
+            return core.ops.mean(out)
+        return out
+
+    helper = LayerHelper("margin_ranking_loss", **locals())
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'margin_rank_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        other, 'other', ['float32', 'float64'], 'margin_rank_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'margin_rank_loss')
+
+    out = paddle.elementwise_sub(other, input)
+    out = paddle.multiply(out, label)
+
+    if margin != 0.0:
+        margin_var = out.block.create_var(dtype=out.dtype)
+        paddle.fill_constant([1], out.dtype, margin, out=margin_var)
+        out = paddle.add(out, margin_var)
+
+    result_out = helper.create_variable_for_type_inference(input.dtype)
+
+    if reduction == 'none':
+        helper.append_op(
+            type="relu", inputs={"X": out}, outputs={"Out": result_out})
+        return result_out
+    elif reduction == 'sum':
+        out = paddle.nn.functional.relu(out)
+        attrs = {"dim": [0], "keep_dim": False, "reduce_all": True}
+        helper.append_op(
+            type="reduce_sum",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs=attrs)
+        return result_out
+    elif reduction == 'mean':
+        out = paddle.nn.functional.relu(out)
+        helper.append_op(
+            type="mean",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs={})
+        return result_out
+
+
+def l1_loss(input, label, reduction='mean', name=None):
+    """
+    This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
+
+    If `reduction` set to ``'none'``, the loss is:
+
+    .. math::
+        Out = \lvert input - label\rvert
+
+    If `reduction` set to ``'mean'``, the loss is:
+
+    .. math::
+        Out = MEAN(\lvert input - label\rvert)
+
+    If `reduction` set to ``'sum'``, the loss is:
+
+    .. math::
+        Out = SUM(\lvert input - label\rvert)
+
+
+    Parameters:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
+        reduction (str, optional): Indicate the reduction to apply to the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+    Returns:
+        Tensor, the L1 Loss of Tensor ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label)
+            print(l1_loss.numpy())
+            # [0.35]
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none')
+            print(l1_loss.numpy())
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+            print(l1_loss.numpy())
+            # [1.4]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
+            "received %s, which is not allowed." % reduction)
+
+    if in_dygraph_mode():
+        unreduced = _elementwise_op_in_dygraph(
+            input, label, axis=-1, act='abs', op_name='elementwise_sub')
+        if reduction == 'mean':
+            return core.ops.mean(unreduced)
+        elif reduction == 'sum':
+            return core.ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', False,
+                                       'reduce_all', True)
+        else:
+            return unreduced
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
+
+    if reduction == 'sum':
+        unreduced = paddle.elementwise_sub(input, label, act='abs')
+        return paddle.sum(unreduced, name=name)
+    elif reduction == 'mean':
+        unreduced = paddle.elementwise_sub(input, label, act='abs')
+        return paddle.mean(unreduced, name=name)
+    else:
+        return paddle.elementwise_sub(input, label, act='abs', name=name)
+
+
+def nll_loss(input,
+             label,
+             weight=None,
+             ignore_index=-100,
+             reduction='mean',
+             name=None):
+    """
+    This api returns negative log likelihood.
+    See more detail in :ref:`api_nn_loss_NLLLoss` .
+
+    Parameters:
+         input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
+             But in K-dimension situation, the shape is :math:`[N, C, d_1, d_2, ..., d_K]`.
+             The data type is float32, float64.
+         label (Tensor): Label tensor, the shape is :math:`[N,]` or :math:`[N, d_1, d_2, ..., d_K]`.
+             The data type is int64.
+         weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+             to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise,
+             it treated as if having all ones. the data type is
+             float32, float64, Default is ``'None'``.
+         ignore_index (int64, optional): Specifies a target value that is ignored
+             and does not contribute to the input gradient.
+         reduction (str, optional): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
+         name (str, optional): Name for the operation (optional, default is None).
+             For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+         `Tensor`, the value of negative log likelihood loss.
+
+    Examples:
+        .. code-block:: python
+                import paddle
+                import numpy as np
+                from paddle.nn.functional import nll_loss
+                log_softmax = paddle.nn.LogSoftmax(axis=1)
+
+                input_np = np.array([[0.88103855, 0.9908683 , 0.6226845 ],
+                                     [0.53331435, 0.07999352, 0.8549948 ],
+                                     [0.25879037, 0.39530203, 0.698465  ],
+                                     [0.73427284, 0.63575995, 0.18827209],
+                                     [0.05689114, 0.0862954 , 0.6325046 ]]).astype(np.float32)
+                label_np = np.array([0, 2, 1, 1, 0]).astype(np.int64)
+
+                place = paddle.CPUPlace()
+                paddle.disable_static(place)
+                input = paddle.to_variable(input_np)
+                log_out = log_softmax(input)
+                label = paddle.to_variable(label_np)
+                result = nll_loss(log_out, label)
+                print(result.numpy()) # [1.0720209]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
+            "'none', but received %s, which is not allowed." % reduction)
+
+    input_shape = list(input.shape)
+    input_dims = len(input_shape)
+    if input_dims < 2:
+        raise ValueError('Expected 2 or more dimensions (got {})'.format(
+            input_dims))
+    n = input_shape[0]
+    c = input_shape[1]
+    if in_dygraph_mode():
+        if input_dims != 2 and input_dims != 4:
+            input, _ = core.ops.reshape2(input, 'shape', [n, c, 1, -1])
+            label, _ = core.ops.reshape2(label, 'shape', [n, 1, -1])
+            out_shape = [n] + input_shape[2:]
+        out, total_weight = core.ops.nll_loss(input, label, weight,
+                                              'ignore_index', ignore_index,
+                                              'reduction', reduction)
+        if input_dims != 2 and input_dims != 4 and reduction == 'none':
+            out, _ = core.ops.reshape2(out, 'shape', out_shape)
+        return out
+
+    helper = LayerHelper('nll_loss', **locals())
+
+    if input_dims != 2 and input_dims != 4:
+        input = reshape(input, shape=[n, c, 1, -1])
+        label = reshape(label, shape=[n, 1, -1])
+        out_shape = [n] + input_shape[2:]
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'nll_loss')
+    fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                               'nll_loss')
+    inputs = {'X': input, 'Label': label}
+    attrs = {'reduction': reduction, 'ignore_index': ignore_index}
+    if weight is not None:
+        if isinstance(weight, Variable):
+            inputs['Weight'] = weight
+
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
+    outputs = {'Out': out, 'Total_weight': total_weight}
+
+    helper.append_op(
+        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
+    if input_dims != 2 and input_dims != 4 and reduction == 'none':
+        out = reshape(out, shape=out_shape)
+
+    return out
+
+
+def kl_div(input, label, reduction='mean', name=None):
+    """
+    This operator calculates the Kullback-Leibler divergence loss
+    between Input(X) and Input(Target). Notes that Input(X) is the
+    log-probability and Input(Target) is the probability.
+
+    KL divergence loss is calculated as follows:
+
+    $$l(x, y) = y * (\log(y) - x)$$
+
+    While :math:`x` is input and :math:`y` is label.
+
+    While :attr:`reduction` is :attr:`none`, output loss is in
+    the same shape as input, loss in each point is calculated
+    seperately and no reduction is applied.
+
+    While :attr:`reduction` is :attr:`mean`, output loss is in
+    shape of [1] and loss value is the mean value of all losses.
+
+    While :attr:`reduction` is :attr:`sum`, output loss is in
+    shape of [1] and loss value is the sum value of all losses.
+
+    While :attr:`reduction` is :attr:`batchmean`, output loss is
+    in shape of [1] and loss value is the sum value of all losses
+    divided by batch size.
+
+    Args:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means
+             any number of additional dimensions. It's data type should be float32, float64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64.
+        reduction (Tensor): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
+        name(str, optional): Name for the operation (optional, default is None). For more information,
+            please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The KL divergence loss. The data type is same as input tensor
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn.functional as F
+
+            paddle.enable_imperative()
+
+            shape = (5, 20)
+            input = np.random.uniform(-10, 10, shape).astype('float32')
+            target = np.random.uniform(-10, 10, shape).astype('float32')
+
+            # 'batchmean' reduction, loss shape will be [N]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='batchmean')
+            # shape=[5]
+
+            # 'mean' reduction, loss shape will be [1]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='mean')
+            # shape=[1]
+
+            # 'sum' reduction, loss shape will be [1]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='sum')
+            # shape=[1]
+
+            # 'none' reduction, loss shape is same with input shape
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='none')
+            # shape=[5, 20]
+
+    """
+    if paddle.in_dynamic_mode():
+        out = core.ops.kldiv_loss(input, label, 'reduction', reduction)
+        return out
+
+    helper = LayerHelper('kl_div', **locals())
+
+    fluid.data_feeder.check_variable_and_dtype(input, 'input',
+                                               ['float32', 'float64'], 'kl_div')
+    fluid.data_feeder.check_variable_and_dtype(label, 'label',
+                                               ['float32', 'float64'], 'kl_div')
+    fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
+
+    loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='kldiv_loss',
+        inputs={'X': input,
+                'Target': label},
+        outputs={'Loss': loss},
+        attrs={'reduction': reduction})
+    return loss
+
+
+def mse_loss(input, label, reduction='mean', name=None):
+    """
+    This op accepts input predications and label and returns the mean square error.
+
+    If :attr:`reduction` is set to ``'none'``, loss is calculated as:
+
+    .. math::
+        Out = (input - label)^2
+
+    If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{mean}((input - label)^2)
+
+    If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{sum}((input - label)^2)
+
+    Parameters:
+        input (Tensor): Input tensor, the data type should be float32 or float64.
+        label (Tensor): Label tensor, the data type should be float32 or float64.
+        reduction (string, optional): The reduction method for the output,
+            could be 'none' | 'mean' | 'sum'.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+
+    Returns:
+        Tensor: The tensor tensor storing the mean square error difference of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+
+        .. code-block:: python
+            import numpy as np
+            import paddle
+
+
+            # static graph mode
+            paddle.enable_static()
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.data(name="input", shape=[1])
+            label = paddle.data(name="label", shape=[1])
+            place = paddle.CPUPlace()
+            input_data = np.array([1.5]).astype("float32")
+            label_data = np.array([1.7]).astype("float32")
+
+            output = mse_loss(input,label)
+            exe = paddle.static.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            output_data = exe.run(
+                paddle.static.default_main_program(),
+                feed={"input":input_data, "label":label_data},
+                fetch_list=[output],
+                return_numpy=True)
+            print(output_data)
+            # [array([0.04000002], dtype=float32)]
+
+            # dynamic graph mode
+            paddle.disable_static()
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+            output = mse_loss(input, label)
+            print(output.numpy())
+            # [0.04000002]
+
+    """
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', "
+            "but received {}.".format(reduction))
+
+    if not paddle.fluid.framework.in_dygraph_mode():
+        paddle.fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'mse_loss')
+        paddle.fluid.data_feeder.check_variable_and_dtype(
+            label, 'label', ['float32', 'float64'], 'mse_loss')
+
+    if reduction == 'none':
+        return paddle.fluid.layers.square(
+            paddle.fluid.layers.elementwise_sub(input, label), name=name)
+    elif reduction == 'mean':
+        return paddle.mean(
+            paddle.fluid.layers.square(
+                paddle.fluid.layers.elementwise_sub(input, label)),
+            name=name)
+    else:
+        return paddle.sum(paddle.fluid.layers.square(
+            paddle.fluid.layers.elementwise_sub(input, label)),
+                          name=name)
+
+
+def ctc_loss(log_probs,
+             labels,
+             input_lengths,
+             label_lengths,
+             blank=0,
+             reduction='mean'):
+    """
+
+    An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be aliased as softmax with CTC, since a native softmax activation
+    is interated to the Warp-CTC library to normalize values for each row of the input tensor.
+
+    Parameters:
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
+
+    Returns:
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and  ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
+
+    Examples:
+
+        .. code-block:: python
+
+            # declarative mode
+            import paddle.nn.functional as F
+            import numpy as np
+            import paddle
+
+            # length of the longest logit sequence
+            max_seq_length = 4
+            #length of the longest label sequence
+            max_label_length = 3
+            # number of logit sequences
+            batch_size = 2
+            # class num
+            class_num = 3
+
+            np.random.seed(1)
+            log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
+                                    [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
+
+                                    [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
+                                    [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
+
+                                    [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
+                                    [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
+
+                                    [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
+                                    [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
+
+                                    [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
+                                    [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32")
+            labels = np.array([[1, 2, 2],
+                            [1, 2, 2]]).astype("int32")
+            input_lengths = np.array([5, 5]).astype("int64")
+            label_lengths = np.array([3, 3]).astype("int64")
+
+            paddle.disable_static()
+            log_probs = paddle.to_tensor(log_probs)
+            labels = paddle.to_tensor(labels)
+            input_lengths = paddle.to_tensor(input_lengths)
+            label_lengths = paddle.to_tensor(label_lengths)
+
+            loss = F.ctc_loss(log_probs, labels,
+                input_lengths,
+                label_lengths,
+                blank=0,
+                reduction='none')
+            print(loss.numpy())  #[3.9179852 2.9076521]
+
+            loss = F.ctc_loss(log_probs, labels,
+                input_lengths,
+                label_lengths,
+                blank=0,
+                reduction='mean')
+            print(loss.numpy())  #[1.1376063]
+
+    """
+
+    loss_out = fluid.layers.warpctc(log_probs, labels, blank, False,
+                                    input_lengths, label_lengths)
+
+    loss_out = fluid.layers.squeeze(loss_out, [-1])
+    assert reduction in ['mean', 'sum', 'none']
+    if reduction == 'mean':
+        loss_out = paddle.mean(loss_out / paddle.cast(label_lengths,
+                                                      loss_out.dtype))
+    elif reduction == 'sum':
+        loss_out = paddle.sum(loss_out)
+    return loss_out
+
+
+def cross_entropy(input,
+                  label,
+                  weight=None,
+                  ignore_index=-100,
+                  reduction='mean'):
+    """
+    This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``,
+    and ``NLLLoss`` together.
+
+    It is useful when training a classification problem with ``C`` classes.
+    If provided, the optional argument ``weight`` should be a 1D Variable assigning
+    weight to each of the classes.
+
+    For predictions label, and target label, the loss is calculated as follows.
+
+    .. math::
+
+        loss_j =  -\\text{input[class]} +
+        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K
+
+    If weight is not ``None``:
+
+    .. math::
+
+        loss_j =  \\text{weight[class]}(-\\text{input[class]} +
+        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K
+
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32, float64. Shape is
+	    (N, C), where C is number of classes, and if shape is more than 2D, this
+	    is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+	    (N, D1, D2,..., Dk), k >= 1.
+        weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+            to each class and the shape is (C). It has the same dimensions as class
+	    number and the data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default is ``-100``.
+
+    Returns:
+        The tensor variable storing the cross_entropy_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+            input_data = np.random.random([5, 100]).astype("float64")
+            label_data = np.random.randint(0, 100, size=(5)).astype(np.int64)
+            weight_data = np.random.random([100]).astype("float64")
+            input =  paddle.to_tensor(input_data)
+            label =  paddle.to_tensor(label_data)
+            weight = paddle.to_tensor(weight_data)
+            loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight)
+            print(loss.numpy())
+
+    """
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'cross_entropy_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'cross_entropy_loss')
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or"
+            " 'none', but received %s, which is not allowed." % reduction)
+
+    #step 1. log_softmax
+    log_softmax_out = paddle.nn.functional.log_softmax(input)
+    if weight is not None and not isinstance(weight, Variable):
+        raise ValueError(
+            "The weight' is not a Variable, please convert to Variable.")
+
+    #step 2. nll_loss
+    input = log_softmax_out
+    helper = LayerHelper('nll_loss', **locals())
+    dtype = helper.input_dtype(input)
+
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'nll_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'nll_loss')
+
+    x_shape = list(input.shape)
+    n = x_shape[0]
+    c = x_shape[1]
+    x_dims = len(x_shape)
+    if x_dims < 2:
+        raise ValueError('Expected 2 or more dimensions (got {})'.format(
+            x_dims))
+    if x_dims != 2 and x_dims != 4:
+        input = reshape(input, shape=[n, c, 1, -1])
+        label = reshape(label, shape=[n, 1, -1])
+        out_shape = [n] + x_shape[2:]
+
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'nll_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'nll_loss')
+    inputs = {'X': input, 'Label': label}
+    attrs = {'reduction': reduction, 'ignore_index': ignore_index}
+    if weight is not None:
+        if isinstance(weight, Variable):
+            inputs['Weight'] = weight
+
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
+    outputs = {'Out': out, 'Total_weight': total_weight}
+
+    helper.append_op(
+        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
+    if x_dims != 2 and x_dims != 4 and reduction == 'none':
+        out = reshape(out, shape=out_shape)
+
+    return out
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 04b031b91ce387c1d8266d53725090d23b592f8c..e9c1a21ecffb1b64cb5ae9e6b802600625cb4685 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -13,16 +13,387 @@
 # limitations under the License.
 
 # TODO: define normalization api  
+import paddle
+import paddle.fluid as fluid
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.framework import in_dygraph_mode, core
+from ...framework import create_parameter
 from ...fluid.layers import l2_normalize  #DEFINE_ALIAS
 from ...fluid.layers import lrn  #DEFINE_ALIAS
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid import core, dygraph_utils
 
 __all__ = [
-    #       'batch_norm',
+    'batch_norm',
     #       'data_norm',
-    #       'group_norm',
-    #       'instance_norm',
+    'instance_norm',
     'l2_normalize',
-    #       'layer_norm',
+    'layer_norm',
     'lrn',
+    'normalize',
     #       'spectral_norm'
 ]
+
+
+def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
+    """
+    This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes
+
+    .. math::
+
+        y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) }
+    
+    .. math::
+        \lvert \lvert x \rvert \rvert_p = \left(\sum_i {\lvert x_i\rvert^p}  \right)^{1/p}
+
+    where, :math:`\sum_i{\lvert x_i\rvert^p}` is calculated along the ``axis`` dimension.
+
+
+    Args:
+        x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64.
+        p (float|int, optional): The exponent value in the norm formulation. Default: 2
+        axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. 
+        epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, the output has the same shape and data type with ``x``.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.disable_static()
+            x = np.arange(6, dtype=np.float32).reshape(2,3)
+            x = paddle.to_tensor(x)
+            y = F.normalize(x)
+            print(y.numpy())
+            # [[0.         0.4472136  0.8944272 ]
+            # [0.42426404 0.5656854  0.7071067 ]]
+
+            y = F.normalize(x, p=1.5)
+            print(y.numpy())
+            # [[0.         0.40862012 0.81724024]
+            # [0.35684016 0.4757869  0.5947336 ]]
+
+            y = F.normalize(x, axis=0)
+            print(y.numpy())
+            # [[0.         0.24253564 0.37139067]
+            # [1.         0.97014254 0.9284767 ]]
+    """
+    if in_dygraph_mode():
+        eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
+        out = core.ops.p_norm(x, 'axis', axis, 'porder',
+                              float(p), 'keepdim', True, 'epsilon', epsilon)
+        return x / core.ops.elementwise_max(out, eps)
+
+    check_type(p, 'p', (float, int), 'normalize')
+    check_type(axis, 'axis', (int), 'normalize')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'normalize')
+    if len(x.shape) == 1 and axis != 0 and axis != -1:
+        raise ValueError(
+            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".
+            format(axis))
+
+    attrs = {
+        'axis': axis,
+        'porder': float(p),
+        'keepdim': True,
+        'epsilon': epsilon,
+    }
+    helper = LayerHelper('p_norm', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    eps = out.block.create_var(dtype=out.dtype)
+    paddle.fill_constant([1], out.dtype, epsilon, out=eps)
+    return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name)
+
+
+def batch_norm(x,
+               running_mean,
+               running_var,
+               weight,
+               bias,
+               training=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               data_format="NCHW",
+               name=None):
+    """
+    Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm.
+    
+    Parameters:
+        x(Tesnor): input value. It's data type should be float32, float64.
+        running_mean(Tensor): running mean.
+        running_var(Tensor): running variance.
+        weight(Tensor): The weight tensor of batch_norm, can not be None.
+        bias(Tensor): The bias tensor of batch_norm can not be None. 
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW".
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          x = np.random.seed(123)
+          x = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          running_mean = np.random.random(size=1).astype('float32')
+          running_variance = np.random.random(size=1).astype('float32')
+          weight_data = np.random.random(size=1).astype('float32')
+          bias_data = np.random.random(size=1).astype('float32')
+          x = paddle.to_tensor(x)
+          rm = paddle.to_tensor(running_mean)
+          rv = paddle.to_tensor(running_variance)
+          w = paddle.to_tensor(weight_data)
+          b = paddle.to_tensor(bias_data)
+          batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
+          print batch_norm_out
+    """
+
+    assert len(x.shape) >= 2, "input dim must be larger than 1"
+
+    # we use not training means use_global_status, more details see nn._BatchNormBase
+    use_global_stats = not training
+    # input ad out must share the memory
+    mean_out = running_mean
+    variance_out = running_var
+
+    if in_dygraph_mode():
+        # for dygraph need tuple
+        attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
+                 data_format, "use_mkldnn", False, "fuse_with_relu", False,
+                 "use_global_stats", use_global_stats)
+        batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
+            x, weight, bias, running_mean, running_var, mean_out, variance_out,
+            *attrs)
+
+        return dygraph_utils._append_activation_in_dygraph(
+            batch_norm_out, act=None)
+
+    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                             'BatchNorm')
+
+    # for static need dict
+    attrs = {
+        "momentum": momentum,
+        "epsilon": epsilon,
+        "data_layout": data_format,
+        "use_mkldnn": False,
+        "fuse_with_relu": False,
+        "use_global_stats": use_global_stats,
+    }
+
+    inputs = {
+        "X": [x],
+        "Scale": [weight],
+        "Bias": [bias],
+        "Mean": [running_mean],
+        "Variance": [running_var]
+    }
+
+    helper = LayerHelper('batch_norm', **locals())
+
+    dtype = x.dtype if x.dtype is not 'float16' else 'float32'
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    batch_norm_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {
+        "Y": [batch_norm_out],
+        "MeanOut": [running_mean],
+        "VarianceOut": [running_var],
+        "SavedMean": [saved_mean],
+        "SavedVariance": [saved_variance]
+    }
+
+    helper.append_op(
+        type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+
+    return helper.append_activation(batch_norm_out)
+
+
+def layer_norm(x,
+               normalized_shape,
+               weight=None,
+               bias=None,
+               epsilon=1e-05,
+               name=None):
+    """
+    see more detail in paddle.nn.LayerNorm
+    
+    Parameters:
+        x(Tensor): Input Tensor. It's data type should be float32, float64.
+        normalized_shape(int|list|tuple): Input shape from an expected input of
+            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
+            If it is a single integer, this module will normalize over the last dimension
+            which is expected to be of that specific size.
+        epsilon(float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+        weight(Tensor, optional): The weight tensor of batch_norm. Default: None.
+        bias(Tensor, optional): The bias tensor of batch_norm. Default: None.
+        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:])
+          layer_norm_out = layer_norm(x)
+
+          print(layer_norm_out.numpy)
+    """
+    input_shape = list(x.shape)
+    input_ndim = len(input_shape)
+    normalized_ndim = len(normalized_shape)
+    begin_norm_axis = input_ndim - normalized_ndim
+    if input_ndim < normalized_ndim or input_shape[
+            begin_norm_axis:] != normalized_shape:
+        str_normalized_shape = str(normalized_shape)
+        raise ValueError('Given normalized_shape is ' + str_normalized_shape +
+                         ', expected input with shape [*, ' +
+                         str_normalized_shape[
+                             1:] + ', but got input shape ' + str(input_shape))
+
+    if in_dygraph_mode():
+        pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon,
+                                            'begin_norm_axis', begin_norm_axis)
+        return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
+
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm')
+
+    inputs = dict()
+    inputs['X'] = [x]
+    if weight:
+        inputs['Scale'] = [weight]
+    if bias:
+        inputs['Bias'] = [bias]
+    attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}
+
+    # create output
+    helper = LayerHelper('layer_norm', **locals())
+    mean_out = helper.create_variable_for_type_inference(
+        dtype=x.type, stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(
+        dtype=x.type, stop_gradient=True)
+    layer_norm_out = helper.create_variable_for_type_inference(x.type)
+
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "begin_norm_axis": begin_norm_axis})
+
+    return helper.append_activation(layer_norm_out)
+
+
+def instance_norm(x,
+                  running_mean=None,
+                  running_var=None,
+                  weight=None,
+                  bias=None,
+                  use_input_stats=True,
+                  momentum=0.9,
+                  eps=1e-05,
+                  data_format="NCHW",
+                  name=None):
+    """
+    See more detail in nn.layer.InstanceNorm2d.
+
+    Parameters:
+        x(Tensor): Input Tensor. It's data type should be float32, float64.
+        running_mean(Tensor): running mean. Default None.
+        running_var(Tensor): running variance. Default None.
+        weight(Tensor, optional): The weight tensor of instance_norm. Default: None.
+        bias(Tensor, optional): The bias tensor of instance_norm. Default: None.
+        eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        use_input_stats(bool): Default True.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW".
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None.
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm_out = paddle.nn.functional.instancenorm(x)
+
+          print(instance_norm_out.numpy)
+
+    """
+
+    if in_dygraph_mode():
+        out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps,
+                                           "momentum", momentum, "data_format",
+                                           data_format)
+        return out
+
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm")
+
+    attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format}
+
+    if weight and bias:
+        inputs = {"X": [x], "Scale": [weight], "Bias": [bias]}
+    else:
+        inputs = {"X": [x]}
+
+    helper = LayerHelper('instance_norm', **locals())
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
+
+    outputs = {
+        "Y": [instance_norm_out],
+        "SavedMean": [saved_mean],
+        "SavedVariance": [saved_variance]
+    }
+
+    helper.append_op(
+        type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+    return instance_norm_out
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
old mode 100644
new mode 100755
index 618145fb1fad47e2105edf6186bb4606494d57c9..ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -17,5 +17,1447 @@ from ...fluid.layers import pool2d  #DEFINE_ALIAS
 from ...fluid.layers import pool3d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool2d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool3d  #DEFINE_ALIAS
+from ...fluid import core
+from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
+from ...fluid.layers import utils, LayerHelper
+from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from ...fluid.layers import unsqueeze, squeeze
 
-__all__ = ['pool2d', 'pool3d', 'adaptive_pool2d', 'adaptive_pool3d']
+__all__ = [
+    'pool2d',
+    'pool3d',
+    'avg_pool1d',
+    'max_pool1d',
+    'adaptive_avg_pool1d',
+    'adaptive_max_pool1d',
+    'adaptive_avg_pool2d',
+    'adaptive_avg_pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
+    'max_pool2d',
+    'avg_pool2d',
+    'max_pool3d',
+    'avg_pool3d',
+]
+
+
+def check_input(x, dimension):
+    if len(x.shape) != dimension:
+        raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}".
+                         format(len(x.shape), type(x)))
+
+
+def check_instance(x, x_name, types=(int, float)):
+
+    if not isinstance(x, types):
+        raise ValueError("Excepted {} type for {} but received type: {}. ".
+                         format(types, x_name, type(x)))
+
+
+def update_padding1d(padding, pool_type='avg'):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, list) or isinstance(ele, tuple):
+            return True
+        return False
+
+    if is_list_or_tuple(padding):
+        if padding.__len__() == 1 and not is_list_or_tuple(padding[0]):
+            return [0, padding[0]]
+        else:
+            raise ValueError(
+                "{}_pool1d() argument 'padding' should contain one int (got {})".
+                format(pool_type, padding.__len__()))
+    else:
+        padding = [0, padding]
+
+    return padding
+
+
+def update_padding2d(padding, data_format):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, list) or isinstance(ele, tuple):
+            return True
+        return False
+
+    if is_list_or_tuple(padding) and len(padding) == 4:
+        if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
+            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[2:4]
+            padding = [ele for a_list in padding for ele in a_list]
+        elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
+            if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[1:3]
+            padding = [ele for a_list in padding for ele in a_list]
+        padding = utils.convert_to_list(padding, 4, 'padding')
+
+        if utils._is_symmetric_padding(padding, 2):
+            padding = [padding[0], padding[2]]
+    else:
+        padding = utils.convert_to_list(padding, 2, 'padding')
+
+    return padding
+
+
+def update_padding3d(padding, data_format):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, (list, tuple)):
+            return True
+        return False
+
+    if is_list_or_tuple(padding) and len(padding) == 5:
+        if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"):
+            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[2:5]
+            padding = [ele for a_list in padding for ele in a_list]
+        elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"):
+            if not (padding[0] == [0, 0] and padding[4] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[1:4]
+            padding = [ele for a_list in padding for ele in a_list]
+        padding = utils.convert_to_list(padding, 6, 'padding')
+        if utils._is_symmetric_padding(padding, 3):
+            padding = [padding[0], padding[2], padding[4]]
+
+    elif is_list_or_tuple(padding) and len(padding) == 6:
+        padding = utils.convert_to_list(padding, 6, 'padding')
+        if utils._is_symmetric_padding(padding, 3):
+            padding = [padding[0], padding[2], padding[4]]
+    else:
+        padding = utils.convert_to_list(padding, 3, 'padding')
+
+    return padding
+
+
+def avg_pool1d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               count_include_pad=True,
+               ceil_mode=False,
+               name=None):
+    """
+
+    This operation applies a 1D average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
+
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L]. where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
+            then the input is implicitly zero-padded on both sides for padding number of points.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # pool_out shape: [1, 3, 16]
+
+    """
+    """NCL to NCHW"""
+    data_format = "NCHW"
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d')
+    check_input(x, 3)
+    x = unsqueeze(x, [2])
+    kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size')
+    kernel_size = [1] + kernel_size
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 1, 'pool_stride')
+        stride = [1] + stride
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0]
+
+    padding = update_padding1d(padding, "avg")
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
+            False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode',
+            ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format',
+            data_format)
+        return squeeze(output, [2])
+
+    op_type = 'pool2d'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    return squeeze(pool_out, [2])
+
+
+def max_pool1d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               name=None):
+    """
+
+    Applies a 1D max pooling over an input signal composed of several input planes based
+    on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L], where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`.
+        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
+            If it is set to False, the floor function will be used. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # pool_out shape: [1, 3, 16]
+
+          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
+          # pool_out shape: [1, 3, 16],  indices shape: [1, 3, 16]
+
+    """
+    """NCL to NCHW"""
+    data_format = "NCHW"
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d')
+    check_input(x, 3)
+    x = unsqueeze(x, [2])
+    kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0]
+
+    padding = update_padding1d(padding, 'max')
+
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
+            'paddings', padding, 'padding_algorithm', padding_algorithm,
+            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
+            'exclusive', True, 'data_format', data_format)
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+
+    op_type = 'max_pool2d_with_index'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        })
+
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+
+
+def adaptive_avg_pool1d(x, output_size, name=None):
+    """
+
+    This operation applies a 1D adaptive average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For average adaptive pool1d:
+
+    ..  math::
+
+        lstart &= floor(i * L_{in} / L_{out})
+
+        lend &= ceil((i + 1) * L_{in} / L_{out})
+
+        Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
+        name(str, optional): For detailed information, please refer
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
+
+    Returns:
+            Tensor: The output tensor of adaptive average pooling result. The data type is same
+                      as input tensor.
+
+    Raises:
+            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+              # average adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_average_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16])
+    """
+    pool_type = 'avg'
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
+                             'adaptive_pool2d')
+    check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
+
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+
+    l_type = "pool2d"
+    x = unsqueeze(x, [2])
+    if in_dygraph_mode():
+        pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
+                                   pool_size, 'adaptive', True)
+        return squeeze(pool_out, [2])
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return squeeze(pool_out, [2])
+
+
+def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
+    """
+    This operation applies a 1D adaptive max pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For max adaptive pool1d:
+
+    ..  math::
+
+        lstart &= floor(i * L_{in} / L_{out})
+
+        lend &= ceil((i + 1) * L_{in} / L_{out})
+
+        Output(i) &= max(Input[lstart:lend])}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+                with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
+
+    Returns:
+            Tensor: The output tensor of adaptive pooling result. The data type is same
+                      as input tensor.
+
+    Raises:
+            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+              # max adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = max(input[:, :, lstart: lend])
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_max_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16])
+
+              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
+              # pool_out shape: [1, 3, 16] indices  shape: [1, 3, 16]
+
+    """
+    pool_type = 'max'
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
+                             'adaptive_max_pool1d')
+    check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
+    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
+
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+
+    l_type = 'max_pool2d_with_index'
+
+    x = unsqueeze(x, [2])
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+
+
+def max_pool2d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               data_format="NCHW",
+               name=None):
+    """
+    This operation applies 2D max pooling over input feature based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+           stride: stride
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          output = F.max_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          # output.shape [1, 3, 16, 16]
+
+          # for return_indices=True
+          output, max_indices = F.max_pool2d(input,
+                                             kernel_size=2,
+                                             stride=2,
+                                             padding=0,
+                                             return_indices=True)
+          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
+    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 2, 'pool_stride')
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0]
+
+    padding = update_padding2d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.max_pool2d_with_index(
+            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
+            'paddings', padding, 'padding_algorithm', padding_algorithm,
+            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
+            'exclusive', True, 'data_format', data_format)
+        return output if return_indices else output[0]
+
+    op_type = 'max_pool2d_with_index'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        })
+
+    return (pool_out, mask) if return_indices else pool_out
+
+
+def avg_pool2d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               ceil_mode=False,
+               count_include_pad=True,
+               divisor_override=None,
+               data_format="NCHW",
+               name=None):
+    """
+    This operation applies 2D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # avg pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          output = F.avg_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          # output.shape [1, 3, 16, 16]
+
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
+    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 2, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0]
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+    pool_padding = update_padding2d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
+            False, 'padding_algorithm', padding_algorithm, 'strides', stride,
+            'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1]) / divisor_override
+
+    op_type = 'pool2d'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": "avg",
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": pool_padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    if divisor_override is None:
+        return pool_out
+    else:
+        check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
+
+
+def max_pool3d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               data_format="NCDHW",
+               name=None):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, D_{in}, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+           $$
+           \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\
+                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
+                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
+                          shape [N, C, D, H, W]. The format of
+                          input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is
+                          the number of channels, `D` is the depth of the feature,
+                          `H` is the height of the feature, and `W` is the width
+                          of the feature.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output = F.max_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          output.shape [1, 3, 16, 16, 16]
+
+          # for return_indices=True
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output, max_indices = paddle.nn.functional.max_pool3d(input,
+                                        kernel_size = 2,
+                                        stride = 2,
+                                        padding=0,
+                                        return_indices=True)
+          # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16],
+
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0, 0]
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s" % str(data_format))
+    padding = update_padding3d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.max_pool3d_with_index(
+            x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride,
+            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', True, 'data_format', data_format)
+        return output if return_indices else output[0]
+
+    op_type = "max_pool3d_with_index"
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": False,
+            "data_format": data_format,
+        })
+
+    return (pool_out, mask) if return_indices else pool_out
+
+
+def avg_pool3d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               ceil_mode=False,
+               count_include_pad=False,
+               divisor_override=None,
+               data_format="NCDHW",
+               name=None):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
+                          shape [N, C, D, H, W], where `N` is batch size, `C` is
+                          the number of channels, `D` is the depth of the feature,
+                          `H` is the height of the feature, and `W` is the width
+                          of the feature.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          import paddle
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          # avg pool3d
+          pool3d = paddle.nn.functional.avg_pool3d(
+                                            input,
+                                            kernel_size = 2,
+                                            stride = 2,
+                                            padding=0)
+          # pool3d.shape: [1, 3, 16, 16, 16]
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0, 0]
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s" % str(data_format))
+    padding = update_padding3d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.pool3d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
+            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1] *
+                             kernel_size[2]) / divisor_override
+
+    op_type = "pool3d"
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    if divisor_override is None:
+        return pool_out
+    else:
+        check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1] *
+                           kernel_size[2]) / divisor_override
+
+
+def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
+    """
+
+    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
+
+    For avg adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+    Args:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor.
+                          The data type can be float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two element, (H, W). H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `data_format` is not "NCHW" or "NHWC".
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(i * W / n)
+            #             wend = ceil((i + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            pool_out = paddle.nn.functional.adaptive_avg_pool2d(
+                            x = x,
+                            output_size=[3, 3])
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_avg_pool2d')
+    check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d')
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    if data_format == "NCHW":
+        in_h, in_w = x.shape[2:4]
+    else:
+        in_h, in_w = x.shape[1:3]
+
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 2, 'output_size')
+    else:
+        if output_size[0] == None:
+            output_size[0] = in_h
+        if output_size[1] == None:
+            output_size[1] = in_w
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size,
+                                 'global_pooling', False, 'adaptive', True,
+                                 'data_format', data_format)
+        return output
+
+    l_type = 'pool2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        })
+
+    return pool_out
+
+
+def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
+    """
+
+    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
+
+    For avg adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
+    Args:
+        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
+                          The data type can be float32, float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
+            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `data_format` is not "NCDHW" or "NDHWC".
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 8, 32, 32]
+            pool_out = paddle.nn.functional.adaptive_avg_pool3d(
+                            x = x,
+                            output_size=[3, 3, 3])
+            # pool_out.shape is [2, 3, 3, 3, 3]
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_avg_pool3d')
+    check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d')
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    if data_format == "NCDHW":
+        in_l, in_h, in_w = x.shape[2:5]
+    else:
+        in_l, in_h, in_w = x.shape[1:4]
+
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 3, 'output_size')
+    else:
+        if output_size[0] == None:
+            output_size[0] = in_l
+        if output_size[1] == None:
+            output_size[1] = in_h
+        if output_size[2] == None:
+            output_size[2] = in_w
+
+    if in_dygraph_mode():
+        output = core.ops.pool3d(x, 'pooling_type', 'avg', 'ksize', output_size,
+                                 'global_pooling', False, 'adaptive', True,
+                                 'data_format', data_format)
+        return output
+
+    l_type = 'pool3d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        })
+
+    return pool_out
diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/nn/functional/rnn.py
index 520cf44360dc370052375c2c9ef3d0b00fbc05de..b7a97bc5aa303ca507cac37798a3625d498050e3 100644
--- a/python/paddle/nn/functional/rnn.py
+++ b/python/paddle/nn/functional/rnn.py
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define function of recurrent neural network  
+from paddle.fluid.layers.rnn import rnn, birnn
 
-__all__ = [
-    #       'gru_unit',
-    #       'lstm',
-    #       'lstm_unit'
-]
+__all__ = ['rnn', 'birnn']
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index a2cc8fde5ad7147b7af4765de834508f1f3cc825..1dfdac26e990851ac5f192742acd47fb92633d0d 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -12,9 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ...device import get_cudnn_version
+from ...fluid.framework import core, in_dygraph_mode, Variable
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+from ...fluid import dygraph_utils
+import numpy as np
+
 # TODO: define specitial functions used in computer vision task  
 from ...fluid.layers import affine_channel  #DEFINE_ALIAS
-from ...fluid.layers import affine_grid  #DEFINE_ALIAS
 from ...fluid.layers import anchor_generator  #DEFINE_ALIAS
 from ...fluid.layers import bipartite_match  #DEFINE_ALIAS
 from ...fluid.layers import box_clip  #DEFINE_ALIAS
@@ -44,7 +50,7 @@ from ...fluid.layers import yolov3_loss  #DEFINE_ALIAS
 
 from ...fluid.layers import fsp_matrix  #DEFINE_ALIAS
 from ...fluid.layers import image_resize_short  #DEFINE_ALIAS
-from ...fluid.layers import pixel_shuffle  #DEFINE_ALIAS
+# from ...fluid.layers import pixel_shuffle  #DEFINE_ALIAS
 from ...fluid.layers import retinanet_detection_output  #DEFINE_ALIAS
 from ...fluid.layers import retinanet_target_assign  #DEFINE_ALIAS
 from ...fluid.layers import roi_perspective_transform  #DEFINE_ALIAS
@@ -89,3 +95,313 @@ __all__ = [
     'yolo_box',
     'yolov3_loss'
 ]
+
+
+def affine_grid(theta, out_shape, align_corners=True, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    Args:
+        theta (Tensor) - A tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters.
+                           The data type can be float32 or float64.
+        out_shape (Tensor | list | tuple): The shape of target output with format [batch_size, channel, height, width].
+                                             ``out_shape`` can be a Tensor or a list or tuple. The data
+                                             type must be int32.
+        align_corners(bool): Whether to align corners of target feature map and source feature map. Default: True.
+        name(str|None): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`.
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            paddle.disable_static()
+            # theta shape = [1, 2, 3]
+            theta = np.array([[[-0.7, -0.4, 0.3],
+                               [ 0.6,  0.5, 1.5]]]).astype("float32")
+            theta_t = paddle.to_tensor(theta)
+            y_t = F.affine_grid(
+                    theta_t,
+                    [1, 2, 3, 3],
+                    align_corners=False)
+            print(y_t.numpy())
+            
+            #[[[[ 1.0333333   0.76666665]
+            #   [ 0.76666665  1.0999999 ]
+            #   [ 0.5         1.4333333 ]]
+            #
+            #  [[ 0.5666667   1.1666666 ]
+            #   [ 0.3         1.5       ]
+            #   [ 0.03333333  1.8333334 ]]
+            #
+            #  [[ 0.10000002  1.5666667 ]
+            #   [-0.16666666  1.9000001 ]
+            #   [-0.43333334  2.2333333 ]]]]
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Tensor.")
+    check_variable_and_dtype(theta, 'theta', ['float32', 'float64'],
+                             'affine_grid')
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None and cudnn_version >= 6000 and align_corners:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
+            isinstance(out_shape, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Tensor.")
+
+    if in_dygraph_mode():
+        _out_shape = out_shape.numpy().tolist() if isinstance(
+            out_shape, Variable) else out_shape
+        return core.ops.affine_grid(theta, "output_shape", _out_shape,
+                                    "align_corners", align_corners, "use_cudnn",
+                                    use_cudnn)
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {"align_corners": align_corners, "use_cudnn": use_cudnn}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+        check_variable_and_dtype(out_shape, 'out_shape', ['int32'],
+                                 'affine_grid')
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
+
+
+def grid_sample(x,
+                grid,
+                mode='bilinear',
+                padding_mode='zeros',
+                align_corners=True,
+                name=None):
+    """
+    This operation samples input X by using bilinear interpolation or
+    nearest interpolation based on flow field grid, which is usually
+    generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
+    is the concatenation of (x, y) coordinates with shape [N, H, W] each,
+    where x is indexing the 4th dimension (in width dimension) of input
+    data x and y is indexing the 3rd dimension (in height dimension),
+    finally results is the bilinear interpolation or nearest value of 4 nearest corner
+    points. The output tensor shape will be [N, C, H, W].
+    .. code-block:: text
+        Step 1:
+        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+        .. code-block:: text
+            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+        Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
+        interpolate point value by 4 nearest points or nearest interpolate point value
+        by nearest point.
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- wn
+        For bilinear interpolation:
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_s + 1               // south side y coord
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-east point value
+        es = X[:, :, y_s, x_w]      // north-east point value
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
+    Args:
+        x(Tensor): The input tensor, which is a 4-d tensor with shape
+                     [N, C, H, W], N is the batch size, C is the channel
+                     number, H and W is the feature height and width.
+                     The data type is float32 or float64.
+        grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
+                        data type is float32 or float64.
+        mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
+                         Default: 'bilinear'.
+        padding_mode(str, optional) The padding method used when source index
+                   is out of input images. It can be 'zeros', 'reflect' and 'border'.
+                   Default: zeros.
+        align_corners(bool, optional): If `align_corners` is true, it will projects
+                   -1 and 1 to the centers of the corner pixels. Otherwise, it will
+                   projects -1 and 1 to the image edges.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            
+            # shape=[1, 1, 3, 3]
+            x = np.array([[[[-0.6,  0.8, -0.5],
+                            [-0.5,  0.2,  1.2],
+                            [ 1.4,  0.3, -0.2]]]]).astype("float64")
+            
+            # grid shape = [1, 3, 4, 2]
+            grid = np.array(
+                         [[[[ 0.2,  0.3],
+                            [-0.4, -0.3],
+                            [-0.9,  0.3],
+                            [-0.9, -0.6]],
+                           [[ 0.4,  0.1],
+                            [ 0.9, -0.8],
+                            [ 0.4,  0.5],
+                            [ 0.5, -0.2]],
+                           [[ 0.1, -0.8],
+                            [-0.3, -1. ],
+                            [ 0.7,  0.4],
+                            [ 0.2,  0.8]]]]).astype("float64")
+            
+            paddle.disable_static()
+            x = paddle.to_tensor(x)
+            grid = paddle.to_tensor(grid)
+            y_t = F.grid_sample(
+                x,
+                grid,
+                mode='bilinear',
+                padding_mode='border',
+                align_corners=True)
+            print(y_t.numpy())
+            
+            # output shape = [1, 1, 3, 4]
+            # [[[[ 0.34   0.016  0.086 -0.448]
+            #    [ 0.55  -0.076  0.35   0.59 ]
+            #    [ 0.596  0.38   0.52   0.24 ]]]]
+    """
+    helper = LayerHelper("grid_sample", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
+    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
+                             'grid_sampler')
+    if not isinstance(x, Variable):
+        raise ValueError("The x should be a Variable")
+    if not isinstance(grid, Variable):
+        raise ValueError("The grid should be a Variable")
+    _modes = ['bilinear', 'nearest']
+    _padding_modes = ['zeros', 'reflect', 'border']
+    if mode not in _modes:
+        raise ValueError(
+            "The mode of grid sample function should be in {}, but got: {}".
+            format(_modes, mode))
+    if padding_mode not in _padding_modes:
+        raise ValueError(
+            "The padding mode of grid sample function should be in {}, but got: {}".
+            format(_padding_modes, padding_mode))
+
+    if not isinstance(align_corners, bool):
+        raise ValueError("The align corners should be bool, but got: {}".format(
+            align_corners))
+
+    cudnn_version = get_cudnn_version()
+    use_cudnn = False
+    if (cudnn_version is not None
+        ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros':
+        use_cudnn = True
+    ipts = {'X': x, 'Grid': grid}
+    attrs = {
+        'mode': mode,
+        'padding_mode': padding_mode,
+        'align_corners': align_corners,
+        'use_cudnn': use_cudnn
+    }
+
+    if in_dygraph_mode():
+        attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners',
+                 align_corners, 'use_cudnn', use_cudnn)
+        out = getattr(core.ops, 'grid_sampler')(x, grid, *attrs)
+    else:
+        out = helper.create_variable_for_type_inference(x.dtype)
+        helper.append_op(
+            type='grid_sampler',
+            inputs=ipts,
+            attrs=attrs,
+            outputs={'Output': out})
+    return out
+
+
+def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
+    """
+    This API implements pixel shuffle operation.
+    See more details in :ref:`api_nn_vision_PixelShuffle` .
+    Parameters:
+        x(Tensor): 4-D tensor, the data type should be float32 or float64.
+        upscale_factor(int): factor to increase spatial resolution.
+        data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+    Returns:
+        Out(tensor): Reshaped tensor according to the new dimension.
+    Raises:
+        ValueError: If the square of upscale_factor cannot divide the channels of input.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
+            paddle.disable_static()
+            x_var = paddle.to_tensor(x)
+            out_var = F.pixel_shuffle(x_var, 3)
+            out = out_var.numpy()
+            print(out.shape) 
+            # (2, 1, 12, 12)
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'pixel_shuffle')
+
+    if not isinstance(upscale_factor, int):
+        raise TypeError("upscale factor must be int type")
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
+                         "But recevie Attr(data_format): {} ".format(
+                             data_format))
+
+    if in_dygraph_mode():
+        return core.ops.pixel_shuffle(x, "upscale_factor", upscale_factor,
+                                      "data_format", data_format)
+
+    helper = LayerHelper("pixel_shuffle", **locals())
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="pixel_shuffle",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"upscale_factor": upscale_factor,
+               "data_format": data_format})
+    return out
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 7173c5b587759f38a6c9b7172c02f326e09033a3..3399e4e34c9e3bc61fde515fc1917deb213f3d0b 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -20,6 +20,10 @@ from . import conv
 from . import extension
 from . import activation
 from . import norm
+from . import rnn
+from . import vision
+from . import distance
+from . import transformer
 
 from .activation import *
 from .loss import *
@@ -27,6 +31,10 @@ from .conv import *
 from .extension import *
 from .activation import *
 from .norm import *
+from .rnn import *
+from .vision import *
+
+from .transformer import *
 # from .activation import PReLU        #DEFINE_ALIAS
 from .activation import ReLU  #DEFINE_ALIAS
 from .activation import LeakyReLU  #DEFINE_ALIAS
@@ -35,16 +43,45 @@ from .activation import Sigmoid  #DEFINE_ALIAS
 from .activation import LogSoftmax  #DEFINE_ALIAS
 from .activation import HSigmoid  #DEFINE_ALIAS
 from .common import BilinearTensorProduct  #DEFINE_ALIAS
+from .common import Bilinear  #DEFINE_ALIAS
 from .common import Pool2D  #DEFINE_ALIAS
 from .common import Pad2D  #DEFINE_ALIAS
+from .common import ReflectionPad1d  #DEFINE_ALIAS
+from .common import ReplicationPad1d  #DEFINE_ALIAS
+from .common import ConstantPad1d  #DEFINE_ALIAS
+from .common import ReflectionPad2d  #DEFINE_ALIAS
+from .common import ReplicationPad2d  #DEFINE_ALIAS
+from .common import ConstantPad2d  #DEFINE_ALIAS
+from .common import ZeroPad2d  #DEFINE_ALIAS
+from .common import ReplicationPad3d  #DEFINE_ALIAS
+from .common import ConstantPad3d  #DEFINE_ALIAS
+from .common import CosineSimilarity  #DEFINE_ALIAS
 from .common import Embedding  #DEFINE_ALIAS
 from .common import Linear  #DEFINE_ALIAS
 from .common import Flatten  #DEFINE_ALIAS
 from .common import UpSample  #DEFINE_ALIAS
-from .conv import Conv2D  #DEFINE_ALIAS
-from .conv import Conv2DTranspose  #DEFINE_ALIAS
-from .conv import Conv3D  #DEFINE_ALIAS
-from .conv import Conv3DTranspose  #DEFINE_ALIAS
+from .common import UpsamplingNearest2d  #DEFINE_ALIAS
+from .common import UpsamplingBilinear2d  #DEFINE_ALIAS
+from .common import Dropout  #DEFINE_ALIAS
+from .common import Dropout2D  #DEFINE_ALIAS
+from .common import Dropout3D  #DEFINE_ALIAS
+from .common import AlphaDropout  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+from .pooling import AvgPool1d  #DEFINE_ALIAS
+from .pooling import MaxPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveMaxPool1d  #DEFINE_ALIAS
+from .pooling import AvgPool2d  #DEFINE_ALIAS
+from .pooling import MaxPool2d  #DEFINE_ALIAS
+from .pooling import AvgPool3d  #DEFINE_ALIAS
+from .pooling import MaxPool3d  #DEFINE_ALIAS
+from .conv import Conv1d  #DEFINE_ALIAS
+from .conv import Conv2d  #DEFINE_ALIAS
+from .conv import Conv3d  #DEFINE_ALIAS
+from .conv import ConvTranspose1d  #DEFINE_ALIAS
+from .conv import ConvTranspose2d  #DEFINE_ALIAS
+from .conv import ConvTranspose3d  #DEFINE_ALIAS
 # from .conv import TreeConv        #DEFINE_ALIAS
 # from .conv import Conv1D        #DEFINE_ALIAS
 from .extension import RowConv  #DEFINE_ALIAS
@@ -56,12 +93,18 @@ from .extension import RowConv  #DEFINE_ALIAS
 # from .learning_rate import PiecewiseDecay        #DEFINE_ALIAS
 # from .learning_rate import PolynomialDecay        #DEFINE_ALIAS
 # from .loss import NCELoss        #DEFINE_ALIAS
+from .loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .loss import CrossEntropyLoss  #DEFINE_ALIAS
 from .loss import MSELoss  #DEFINE_ALIAS
 from .loss import L1Loss  #DEFINE_ALIAS
 from .loss import NLLLoss  #DEFINE_ALIAS
 from .loss import BCELoss  #DEFINE_ALIAS
+from .loss import KLDivLoss  #DEFINE_ALIAS
+from .loss import MarginRankingLoss  #DEFINE_ALIAS
+from .loss import CTCLoss  #DEFINE_ALIAS
+from .loss import SmoothL1Loss  #DEFINE_ALIAS
 from .norm import BatchNorm  #DEFINE_ALIAS
+from .norm import SyncBatchNorm  #DEFINE_ALIAS
 from .norm import GroupNorm  #DEFINE_ALIAS
 from .norm import LayerNorm  #DEFINE_ALIAS
 from .norm import SpectralNorm  #DEFINE_ALIAS
@@ -69,3 +112,6 @@ from .norm import InstanceNorm  #DEFINE_ALIAS
 # from .rnn import RNNCell        #DEFINE_ALIAS
 # from .rnn import GRUCell        #DEFINE_ALIAS
 # from .rnn import LSTMCell        #DEFINE_ALIAS
+
+from .vision import PixelShuffle  #DEFINE_ALIAS
+from .distance import PairwiseDistance  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 02a1d297e83ea4f21b3f1a9cb85b950e5959dc08..c38d6018a2500111280a482aa60d072e65e27742 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -15,19 +15,257 @@
 # TODO: define activation functions of neural network
 
 __all__ = [
-    #       'PReLU',
+    'ELU',
+    'GELU',
+    'Hardshrink',
+    'Tanh',
+    'Hardtanh',
+    'PReLU',
     'ReLU',
+    'ReLU6',
+    'SELU',
     'LeakyReLU',
     'Sigmoid',
-    #       'Softmax',
+    'Softmax',
+    'Softplus',
+    'Softshrink',
+    'Softsign',
+    'Tanhshrink',
+    'LogSigmoid',
     'LogSoftmax',
-    'HSigmoid'
+    'HSigmoid',
 ]
 
 from ...fluid.dygraph import layers
 from ...fluid import core
 from ...fluid.framework import in_dygraph_mode
-from .. import functional
+from ...fluid.param_attr import ParamAttr
+from ...fluid.initializer import Constant
+from paddle.framework import get_default_dtype
+from .. import functional as F
+
+
+class ELU(layers.Layer):
+    """
+    ELU Activation.
+
+    .. math::
+    
+        ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
+
+    Parameters:
+        alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1,6],[1,15.6]]))
+            m = paddle.nn.ELU(0.2)
+            out = m(x)
+            # [[-0.12642411  6.        ]
+            #  [ 1.          15.6      ]]
+    """
+
+    def __init__(self, alpha=1.0, name=None):
+        super(ELU, self).__init__()
+        self._alpha = alpha
+        self._name = name
+
+    def forward(self, x):
+        return F.elu(x, self._alpha, self._name)
+
+
+class GELU(layers.Layer):
+    """
+    GELU Activation.
+
+    If approximate is True
+
+    .. math::
+
+        GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    else
+
+    .. math::
+
+        GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
+
+    Parameters:
+        approximate (bool, optional): Wether to enable approximation. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]]))
+            
+            m = paddle.nn.GELU()
+            out = m(x) # [-0.158655 0.345731 0.841345 1.39979]
+
+            m = paddle.nn.GELU(True)
+            out = m(x) # [-0.158808 0.345714 0.841192 1.39957]
+    """
+
+    def __init__(self, approximate=False, name=None):
+        super(GELU, self).__init__()
+        self._approximate = approximate
+        self._name = name
+
+    def forward(self, x):
+        return F.gelu(x, self._approximate, self._name)
+
+
+class Hardshrink(layers.Layer):
+    """
+    Hardshrink Activation
+
+    .. math::
+
+        hardshrink(x)=
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x > threshold \\\\
+            &x, & & if \\ x < -threshold \\\\
+            &0, & & if \\ others
+            \\end{aligned}
+            \\right.
+
+    Parameters:
+        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
+            m = paddle.nn.Hardshrink()
+            out = m(x) # [-1., 0., 2.5]
+    """
+
+    def __init__(self, threshold=0.5, name=None):
+        super(Hardshrink, self).__init__()
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.hardshrink(x, self._threshold, self._name)
+
+
+class Tanh(layers.Layer):
+    """
+    Tanh Activation.
+
+    .. math::
+        Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Tanh()
+            out = m(x)
+            print(out.numpy())
+            # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+    """
+
+    def __init__(self, name=None):
+        super(Tanh, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.tanh(x, self._name)
+
+
+class Hardtanh(layers.Layer):
+    """
+    Hardtanh Activation
+
+    .. math::
+
+        Hardtanh(x)= \\begin{cases}
+                        max, \\text{if } x > max \\\\
+                        min, \\text{if } x < min \\\\
+                        x,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        min (float, optional): The value of min for Hardtanh. Default is -1.
+        max (float, optional): The value of max for Hardtanh. Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5]))
+            m = paddle.nn.Hardtanh()
+            out = m(x) # # [-1., 0.3, 1.]
+    """
+
+    def __init__(self, min=-1.0, max=1.0, name=None):
+        super(Hardtanh, self).__init__()
+        self._min = min
+        self._max = max
+        self._name = name
+
+    def forward(self, x):
+        return F.hardtanh(x, self._min, self._max, self._name)
 
 
 class HSigmoid(layers.Layer):
@@ -154,7 +392,7 @@ class HSigmoid(layers.Layer):
             [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
 
     def forward(self, input, label, path_table=None, path_code=None):
-        out = functional.hsigmoid(
+        out = F.hsigmoid(
             input,
             label,
             self.weight,
@@ -166,188 +404,660 @@ class HSigmoid(layers.Layer):
         return out
 
 
-class ReLU(layers.Layer):
+class PReLU(layers.Layer):
     """
-	:alias_main: paddle.nn.ReLU
-	:alias: paddle.nn.ReLU,paddle.nn.layer.ReLU,paddle.nn.layer.activation.ReLU
+    PReLU Activation.
+
+    .. math::
+
+        PReLU(x) = max(0, x) + weight * min(0, x)
+
+    Parameters:
+        num_parameters (int, optional): Number of `weight` to learn. The supported values are:
+            1 - a single parameter `alpha` is used for all input channels; 
+            Number of channels - a seperate `alpha` is used for each input channel.
+            Default is 1.
+        init (float, optional): Init value of learnable `weight`. Default is 0.25.
+        weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. 
+            Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape. Default dtype is float32.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            paddle.set_default_dtype("float64")
+
+            data = np.array([[[[-2.0,  3.0, -4.0,  5.0],
+                            [ 3.0, -4.0,  5.0, -6.0],
+                            [-7.0, -8.0,  8.0,  9.0]],
+                            [[ 1.0, -2.0, -3.0,  4.0],
+                            [-5.0,  6.0,  7.0, -8.0],
+                            [ 6.0,  7.0,  8.0,  9.0]]]], 'float64')
+            x = paddle.to_tensor(data)
+            m = paddle.nn.PReLU(1, 0.25)
+            out = m(x)
+            # [[[[-0.5 ,  3.  , -1.  ,  5.  ],
+            #    [ 3.  , -1.  ,  5.  , -1.5 ],
+            #    [-1.75, -2.  ,  8.  ,  9.  ]],
+            #   [[ 1.  , -0.5 , -0.75,  4.  ],
+            #    [-1.25,  6.  ,  7.  , -2.  ],
+            #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
+    """
+
+    def __init__(self, num_parameters=1, init=0.25, weight_attr=None,
+                 name=None):
+        super(PReLU, self).__init__()
+        self._num_parameters = num_parameters
+        self._init = init
+        self._weight_attr = weight_attr
+        self._name = name
+
+        self._weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=[self._num_parameters],
+            dtype=get_default_dtype(),
+            is_bias=False,
+            default_initializer=Constant(self._init))
+
+    def forward(self, x):
+        return F.prelu(x, self._weight)
+
 
+class ReLU(layers.Layer):
+    """
     ReLU Activation.
 
-    .. math:
+    .. math::
 
-        out = max(x, 0)
+        ReLU(x) = max(x, 0)
 
     Parameters:
-        inplace (bool, optional): If inplace is True, the input and output of 
-            ``ReLU`` are the same variable. Otherwise, the input and output of
-            ``ReLU`` are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False.
-    
-    Returns:
-        None
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
     
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+            import paddle
+            import numpy as np
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          my_relu = nn.ReLU()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = my_relu(data)  # [0, 0, 1]
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32'))
+            m = paddle.nn.ReLU()
+            out = m(x) # [0., 0., 1.]
     """
 
-    def __init__(self, inplace=False):
+    def __init__(self, name=None):
         super(ReLU, self).__init__()
-        self._inplace = inplace
+        self._name = name
 
-    def forward(self, input):
-        return functional.relu(input, self._inplace)
+    def forward(self, x):
+        return F.relu(x, self._name)
 
 
-class LeakyReLU(layers.Layer):
+class ReLU6(layers.Layer):
+    """
+    ReLU6 Activation
+
+    .. math::
+
+        ReLU6(x) = min(max(0,x), 6)
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
+            m = paddle.nn.ReLU6()
+            out = m(x) # [0, 0.3, 6]
+    """
+
+    def __init__(self, name=None):
+        super(ReLU6, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.relu6(x, self._name)
+
+
+class SELU(layers.Layer):
+    """
+    SELU Activation
+
+    .. math::
+
+        SELU(x)= scale *
+                 \\begin{cases}
+                   x, \\text{if } x > 0 \\\\
+                   alpha * e^{x} - alpha, \\text{if } x <= 0
+                 \\end{cases}
+
+    Parameters:
+        scale (float, optional): The value of scale(must be greater than 1.0) for SELU. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha(must be no less than zero) for SELU. Default is 1.6732632423543772848170429916717
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
+            m = paddle.nn.SELU()
+            out = m(x) # [[0, 1.050701],[2.101402, 3.152103]]
     """
-	:alias_main: paddle.nn.LeakyReLU
-	:alias: paddle.nn.LeakyReLU,paddle.nn.layer.LeakyReLU,paddle.nn.layer.activation.LeakyReLU
 
+    def __init__(self,
+                 scale=1.0507009873554804934193349852946,
+                 alpha=1.6732632423543772848170429916717,
+                 name=None):
+        super(SELU, self).__init__()
+        self._scale = scale
+        self._alpha = alpha
+        self._name = name
+
+    def forward(self, x):
+        return F.selu(x, self._scale, self._alpha, self._name)
+
+
+class LeakyReLU(layers.Layer):
+    """
     Leaky ReLU Activation.
 
-    .. math:
+    .. math::
 
-        out = max(x, alpha * x)
+        LeakyReLU(x)=
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x >= 0 \\\\
+            &negative\_slope * x, & & otherwise \\\\
+            \\end{aligned}
+            \\right. \\\\
 
     Parameters:
-        alpha (float, optional): Slope of the activation function at x < 0. Default: 0.01.
-        inplace (bool, optional): If inplace is True, the input and output of 
-            ``LeakyReLU`` are the same variable. Otherwise, the input and output of
-            ``LeakyReLU`` are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False. Default: False.
+        negative_slope (float, optional): Slope of the activation function at
+            :math:`x < 0` . Default is 0.01.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
     
-    Returns:
-        None
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
     
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          lrelu = nn.LeakyReLU()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = lrelu(data)  # [-0.02, 0, 1]
+            m = paddle.nn.LeakyReLU()
+            x = paddle.to_tensor(np.array([-2, 0, 1], 'float32'))
+            out = m(x)  # [-0.02, 0., 1.]
     """
 
-    def __init__(self, alpha=1e-2, inplace=False):
+    def __init__(self, negative_slope=0.01, name=None):
         super(LeakyReLU, self).__init__()
-        self._alpha = alpha
-        self._inplace = inplace
+        self._negative_slope = negative_slope
+        self._name = name
 
-    def forward(self, input):
-        return functional.leaky_relu(input, self._alpha, self._inplace)
+    def forward(self, x):
+        return F.leaky_relu(x, self._negative_slope, self._name)
 
 
 class Sigmoid(layers.Layer):
     """
-	:alias_main: paddle.nn.Sigmoid
-	:alias: paddle.nn.Sigmoid,paddle.nn.layer.Sigmoid,paddle.nn.layer.activation.Sigmoid
+    this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x.
+    
+    .. math::
 
-    Sigmoid Activation.
+        Sigmoid(x) = \frac{1}{1 + e^{-x}}
     
-    .. math:
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        output = \frac{1}{1 + e^{-input}}
+    Shape:
+        x: N-D tensor, available dtype is float16, float32, float64.
 
-    Parameters:
-        inplace (bool, optional): If inplace is True, the input and output
-            are the same variable. Otherwise, the input and output
-            are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False.
-    
     Returns:
-        None
+        A callable object of Sigmoid.
     
     Examples:
+
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
           import numpy as np
-          input = fluid.data(name="input", shape=[None, 4])
-          output = nn.Sigmoid()(input)
-          place = fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
+          import paddle
+
+          paddle.disable_static()
           input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
-          output_data = exe.run(feed={"input": input_data},
-                                fetch_list=[output])
-          print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
+          m = paddle.nn.Sigmoid()
+          x = paddle.to_tensor(input_data)
+          output = m(x)
+          print(output.numpy()) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
     """
 
-    def __init__(self, inplace=False):
+    def __init__(self, name=None):
         super(Sigmoid, self).__init__()
-        self._inplace = inplace
+        self.name = name
 
-    def forward(self, input):
-        return functional.sigmoid(input, self._inplace)
+    def forward(self, x):
+        return F.sigmoid(x, self.name)
 
 
-class LogSoftmax(layers.Layer):
+class Softplus(layers.Layer):
+    """
+    Softplus Activation
+
+    .. math::
+
+        Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\
+        \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
+
+    Parameters:
+        beta (float, optional): The value of beta for Softplus. Default is 1
+        threshold (float, optional): The value of threshold for Softplus. Default is 20
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Softplus()
+            out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355]
+    """
+
+    def __init__(self, beta=1, threshold=20, name=None):
+        super(Softplus, self).__init__()
+        self._beta = beta
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.softplus(x, self._beta, self._threshold, self._name)
+
+
+class Softshrink(layers.Layer):
+    """
+    Softshrink Activation
+
+    .. math::
+
+        Softshrink(x)= \\begin{cases}
+                        x - threshold, \\text{if } x > threshold \\\\
+                        x + threshold, \\text{if } x < -threshold \\\\
+                        0,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
+            m = paddle.nn.Softshrink()
+            out = m(x) # [-0.4, 0, 0, 0.3]
     """
-	:alias_main: paddle.nn.LogSoftmax
-	:alias: paddle.nn.LogSoftmax,paddle.nn.layer.LogSoftmax,paddle.nn.layer.activation.LogSoftmax
 
+    def __init__(self, threshold=0.5, name=None):
+        super(Softshrink, self).__init__()
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.softshrink(x, self._threshold, self._name)
+
+
+class Softsign(layers.Layer):
+    """
+    Softsign Activation
+
+    .. math::
+
+        Softsign(x) = \\frac{x}{1 + |x|}
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Softsign()
+            out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
+    """
+
+    def __init__(self, name=None):
+        super(Softsign, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.softsign(x, self._name)
+
+
+class Tanhshrink(layers.Layer):
+    """
+    Tanhshrink Activation
+
+    .. math::
+
+        Tanhshrink(x) = x - tanh(x)
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Tanhshrink()
+            out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
+    """
+
+    def __init__(self, name=None):
+        super(Tanhshrink, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.tanhshrink(x, self._name)
+
+
+class LogSigmoid(layers.Layer):
+    """
+    LogSigmoid Activation.
+    
+    .. math::
+
+        LogSigmoid(x) = log \\frac{1}{1 + e^{-x}}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, or float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+            m = paddle.nn.LogSigmoid()
+            out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
+    """
+
+    def __init__(self, name=None):
+        super(LogSigmoid, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.logsigmoid(x, self._name)
+
+
+class Softmax(layers.Layer):
+    """
+    Softmax Activation.
+
+    This operator implements the softmax layer. The calculation process is as follows:
+
+    1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
+
+    2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second
+    dimension(row length) is the same as the dimension :attr:`axis` of ``x``,
+    and the first dimension(column length) is the product of all other dimensions
+    of ``x``. For each row of the matrix, the softmax operator squashes the
+    K-dimensional(K is the width of the matrix, which is also the size of ``x``'s
+    dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional
+    vector of real values in the range [0, 1] that add up to 1.
+
+    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
+    are performed to restore the two-dimensional matrix to the same dimension as the ``x`` .
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
+
+    .. math::
+
+        Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])}
+
+    Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+
+          Attrs:
+            axis = -1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+
+        Case 2:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+          Attrs:
+            axis = 1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
+                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
+                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
+                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
+                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
+                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
+
+    Parameters:
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[2.0, 3.0, 4.0, 5.0],
+                        [3.0, 4.0, 5.0, 6.0],
+                        [7.0, 8.0, 8.0, 9.0]],
+                        [[1.0, 2.0, 3.0, 4.0],
+                        [5.0, 6.0, 7.0, 8.0],
+                        [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Softmax()
+            out = m(x)
+            # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+            # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+    """
+
+    def __init__(self, axis=-1, name=None):
+        super(Softmax, self).__init__()
+        self._axis = axis
+        self._dtype = None
+        self._name = name
+
+    def forward(self, x):
+        return F.softmax(x, self._axis, self._dtype, self._name)
+
+
+class LogSoftmax(layers.Layer):
+    """
     This operator implements the log_softmax layer. The calculation process is as follows:
 
     .. math::
 
         Out[i, j] = log(softmax(x)) 
-                  = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+                  = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])})
 
     Parameters:
-        axis (int, optional): The index of dimension to perform softmax calculations, it should be in
-            range :math:`[-1, rank-1]`, while :math:`rank` is the rank of input variable. Default: None. 
-            None and -1 means the last dimension.
-        dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of returned tensor. If specified,
-            the input tensor is casted to dtype before the operation is performed. This is useful for
-            preventing data type overflows. Default: None. Supported dtype: float32 or float64
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of the input Tensor . If ``axis`` < 0, it works the
+            same way as :math:`axis + D` . Default is -1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
  
-    Returns:
-        None
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[-2.0, 3.0, -4.0, 5.0],
+                           [3.0, -4.0, 5.0, -6.0],
+                           [-7.0, -8.0, 8.0, 9.0]],
+                          [[1.0, -2.0, -3.0, 4.0],
+                           [-5.0, 6.0, 7.0, -8.0],
+                           [6.0, 7.0, 8.0, 9.0]]])
+            m = paddle.nn.LogSoftmax()
+            x = paddle.to_tensor(x)
+            out = m(x)
+            # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
+            #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
+            #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
+            #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
+            #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
+            #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+    """
 
-          data = np.array([[[-2.0, 3.0, -4.0, 5.0],
-                            [3.0, -4.0, 5.0, -6.0],
-                            [-7.0, -8.0, 8.0, 9.0]],
-                           [[1.0, -2.0, -3.0, 4.0],
-                            [-5.0, 6.0, 7.0, -8.0],
-                            [6.0, 7.0, 8.0, 9.0]]]).astype('float32')
-          my_log_softnmax = nn.LogSoftmax()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = my_log_softnmax(data)
-              # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
-              #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
-              #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
-              #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
-              #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
-              #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
-    """
-
-    def __init__(self, axis=None):
+    def __init__(self, axis=-1, name=None):
         super(LogSoftmax, self).__init__()
         self._axis = axis
+        self._name = name
 
-    def forward(self, input):
-        return functional.log_softmax(input, self._axis)
+    def forward(self, x):
+        return F.log_softmax(x, self._axis)
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 45259bea49d42eb07e0e593531a1680359f81a68..a1e6508c67d96e9f6cc077efe6e61d708674b057 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -12,21 +12,125 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the common classes to build a neural network  
+# TODO: define the common classes to build a neural network
 from ...fluid.dygraph import BilinearTensorProduct  #DEFINE_ALIAS
 from ...fluid.dygraph import Pool2D  #DEFINE_ALIAS
 from ...fluid.dygraph import Embedding  #DEFINE_ALIAS
-from ...fluid.dygraph import Linear  #DEFINE_ALIAS
 from ...fluid.dygraph import Flatten  #DEFINE_ALIAS
 from ...fluid.dygraph import layers
 from .. import functional as F
+from ...fluid.framework import _dygraph_tracer
 
 __all__ = [
-    'BilinearTensorProduct', 'Pool2D', 'Embedding', 'Linear', 'UpSample',
-    'Pad2D'
+    'BilinearTensorProduct',
+    'Pool2D',
+    'Embedding',
+    'Linear',
+    'UpSample',
+    'Pad2D',
+    'UpsamplingNearest2d',
+    'UpsamplingBilinear2d',
+    'ReflectionPad1d',
+    'ReplicationPad1d',
+    'ConstantPad1d',
+    'ReflectionPad2d',
+    'ReplicationPad2d',
+    'ConstantPad2d',
+    'ZeroPad2d',
+    'ConstantPad3d',
+    'ReplicationPad3d',
+    'CosineSimilarity',
+    'Dropout',
+    'Dropout2D',
+    'Dropout3D',
+    'Bilinear',
+    'AlphaDropout',
 ]
 
 
+class Linear(layers.Layer):
+    """
+    
+    Fully-connected linear transformation layer:
+
+    .. math::
+
+        Out = {XW + b}
+
+    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+
+    Linear layer takes only one ``Tensor`` input.
+    The Linear layer multiplies input tensor with weight matrix and
+    produces an output Tensor of shape [N, *, `output_dim`],
+    where N is batch size and `*` means any number of additional dimensions.
+    If ``bias_attr`` is not None, a bias variable will be created and added to the output.
+
+    Parameters:
+        in_features(int): The number of input units in this layer.
+        out_features(int): The number of output units in this layer.
+        weight_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
+            weights(Parameter) of this layer. Default: None.
+        bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Attributes:
+        **weight** (Parameter): the learnable weights of this layer.
+
+        **bias** (Parameter or None): the learnable bias of this layer.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle import nn
+          import numpy as np
+
+          data = np.ones((3,1,2), np.float32)
+          place = paddle.CPUPlace()
+          paddle.disable_static(place)
+          data = paddle.to_tensor(data)
+          weight_attr=paddle.framework.ParamAttr(name="linear_weight", learning_rate=1.0,
+          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+          bias_attr=paddle.framework.ParamAttr(name="linear_bias", learning_rate=1.0,
+          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+          linear = nn.Linear(2,2,weight_attr=weight_attr, bias_attr=bias_attr)
+          res = linear(data)  # [3 3 3 3 3 3]
+    """
+
+    def __init__(self,
+                 in_features,
+                 out_features,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(Linear, self).__init__()
+        self._dtype = self._helper.get_default_dtype()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self.name = name
+        self.weight = self.create_parameter(
+            shape=[in_features, out_features],
+            attr=self._weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self.bias = self.create_parameter(
+            shape=[out_features],
+            attr=self._bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+        self.name = name
+
+    def forward(self, input):
+        out = F.linear(
+            x=input, weight=self.weight, bias=self.bias, name=self.name)
+        return out
+
+
 class UpSample(layers.Layer):
     """
     This op resizes a batch of images.
@@ -34,8 +138,7 @@ class UpSample(layers.Layer):
     or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
     (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
     and the resizing only applies on the three dimensions(depth, height and width).
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the
-    future and only use :attr:`out_shape` instead.
+
     Supporting resample methods:
         'linear' : Linear interpolation
         'bilinear' : Bilinear interpolation
@@ -65,7 +168,7 @@ class UpSample(layers.Layer):
     interpolating functions of three variables (e.g. D-direction,
     H-direction and W-direction in this op) on a rectilinear 3D grid.
     The linear interpolation is performed on three directions.
-    Align_corners and align_mode are optional parameters,the calculation method
+    align_corners and align_mode are optional parameters,the calculation method
     of interpolation can be selected by them.
 
     Example:
@@ -163,16 +266,16 @@ class UpSample(layers.Layer):
     https://en.wikipedia.org/wiki/Trilinear_interpolation.
     
     Parameters:
-        input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
-        size (list|tuple|Variable|None): Output shape of image resize
+        size (list|tuple|Tensor|None): Output shape of image resize
              layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
              when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
              Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
              If a Tensor Variable, its dimensions size should be a 1.
-        scale_factor (float|Variable|None): The multiplier for the input height or width. At
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
              least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
-             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list.
              Default: None.
         mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear',
                        'bicubic' and 'trilinear' currently. Default: 'nearest'
@@ -196,7 +299,7 @@ class UpSample(layers.Layer):
         A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
         or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
     Raises:
-        TypeError: size should be a list or tuple or Variable.
+        TypeError: size should be a list or tuple or Tensor.
         ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
                     'trilinear', 'bicubic', or 'nearest' currently.
         ValueError: 'linear' only support 3-D tensor.
@@ -214,16 +317,18 @@ class UpSample(layers.Layer):
     Examples:
         .. code-block:: python
             import paddle
+            import paddle.nn as nn
             import numpy as np
-            import paddle.fluid.dygraph as dg
-            upsample_op = paddle.nn.UpSample(size=[12,12])
+            paddle.disable_static()
+
             input_data = np.random.rand(2,3,6,10).astype("float32")
-            place = paddle.fluid.CPUPlace()
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                output = upsample_op(input=input)
-                print(output.shape)
-                # [2L, 3L, 12L, 12L]
+            upsample_out  = paddle.nn.UpSample(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+
     """
 
     def __init__(self,
@@ -231,8 +336,9 @@ class UpSample(layers.Layer):
                  scale_factor=None,
                  mode='nearest',
                  align_corners=False,
-                 align_mode=1,
-                 data_format='NCHW'):
+                 align_mode=0,
+                 data_format='NCHW',
+                 name=None):
         super(UpSample, self).__init__()
         self.size = size
         self.scale_factor = scale_factor
@@ -240,16 +346,184 @@ class UpSample(layers.Layer):
         self.align_corners = align_corners
         self.align_mode = align_mode
         self.data_format = data_format
+        self.name = name
 
-    def forward(self, input):
+    def forward(self, x):
         out = F.interpolate(
-            input,
+            x,
             size=self.size,
             scale_factor=self.scale_factor,
             mode=self.mode,
             align_corners=self.align_corners,
             align_mode=self.align_mode,
-            data_format=self.data_format)
+            data_format=self.data_format,
+            name=self.name)
+
+        return out
+
+
+class UpsamplingNearest2d(layers.Layer):
+    """
+    This op upsamples a batch of images, using nearest neighbours' pixel values.
+    The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the upsampling only applies on the two dimensions(height and width).
+
+    Nearest neighbor interpolation is to perform nearest neighbor interpolation
+    in both the 3rd dimension(in height direction) and the 4th dimension(in width
+    direction) on input tensor.
+    
+    For details of nearest neighbor interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
+    
+        x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_h, out_w) when input is a 4-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At
+             least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             Default: None. Has to match input size if it is a list.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`,
+            `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored
+            in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: 'nearest' only support 4-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        ValueError: data_format can only be 'NCHW', 'NHWC'.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            upsample_out  = paddle.nn.UpsamplingNearest2d(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 data_format='NCHW',
+                 name=None):
+        super(UpsamplingNearest2d, self).__init__()
+        self.size = size
+        self.scale_factor = scale_factor
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='nearest',
+            align_corners=False,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name)
+
+        return out
+
+
+class UpsamplingBilinear2d(layers.Layer):
+    """
+    This op upsamples a batch of images, using bilinear' pixel values.
+    The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the upsampling only applies on the two dimensions(height and width).
+
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this op) on a rectilinear 2D grid. The key idea is
+    to perform linear interpolation first in one direction, and then
+    again in the other direction.
+    
+    For details of bilinear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bilinear_interpolation.
+    
+        x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_h, out_w) when input is a 4-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At
+             least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             Default: None. Has to match input size if it is a list.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`,
+            `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored
+            in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: 'bilinear' only support 4-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        ValueError: data_format can only be 'NCHW', 'NHWC'.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            upsample_out  = paddle.nn.UpsamplingBilinear2d(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 data_format='NCHW',
+                 name=None):
+        super(UpsamplingBilinear2d, self).__init__()
+        self.size = size
+        self.scale_factor = scale_factor
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='bilinear',
+            align_corners=True,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name)
 
         return out
 
@@ -258,12 +532,10 @@ class Pad2D(layers.Layer):
     """
         :alias_main: paddle.nn.Pad2D
         :alias: paddle.nn.Pad2D,paddle.nn.layer.Pad2D,paddle.nn.layer.common.Pad2D
-
     This interface is used to construct a callable object of the ``Pad2D``  class.
     The Pad2D layer pads the input tensor boundaries according to 'paddings' and 'mode'.
     If mode is 'reflect', paddings[0] and paddings[1] must be no greater
     than height-1. And the width dimension has the same condition.
-
     Parameters:
         paddings (int | List[int32]): The padding size. If padding is a int, uses the same 
             padding in all boundaries, if padding is a List, it must contain four integers, 
@@ -278,16 +550,12 @@ class Pad2D(layers.Layer):
         data_format (str): An string from: "NHWC", "NCHW". Specify the data format of
                            the input data.
                            Default is  "NCHW"
-
     Returns: 
         None
-
     Examples:
         .. code-block:: text
-
             Input = [[[[1., 2., 3.],
                        [4., 5., 6.]]]]
-
             Case 0:
                 paddings = [0, 1, 2, 3],
                 mode = 'constant'
@@ -295,24 +563,20 @@ class Pad2D(layers.Layer):
                 Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.],
                          [0., 0., 4., 5., 6., 0., 0., 0.],
                          [0., 0., 0., 0., 0., 0., 0., 0.]]]]
-
             Case 1:
                 paddings = [0, 1, 2, 1],
                 mode = 'reflect'
                 Out = [[[[3., 2., 1., 2., 3., 2.],
                          [6., 5., 4., 5., 6., 5.],
                          [3., 2., 1., 2., 3., 2.]]]]
-
             Case 2:
                 paddings = [0, 1, 2, 1],
                 mode = 'edge'
                 Out = [[[[1., 1., 1., 2., 3., 3.],
                          [4., 4., 4., 5., 6., 6.],
                          [4., 4., 4., 5., 6., 6.]]]]
-
     Code Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             import paddle.nn as nn
             import numpy as np
@@ -342,3 +606,944 @@ class Pad2D(layers.Layer):
             mode=self._mode,
             pad_value=self._pad_value,
             data_format=self._data_format)
+
+
+class Bilinear(layers.Layer):
+    """
+
+    This layer performs bilinear on two inputs.
+
+    .. math::
+
+      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
+
+      out = out + b
+
+    In this formula:
+     - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features].
+     - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features].
+     - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features].
+     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features].
+     - :math:`b`: the learned bias, shape is [1, out_features].
+     - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`.
+
+    Parameters:
+       in1_features (int): The dimension of each first input(`x1`).
+       in2_features (int): The dimension of each second input(`x2`).
+       out_features (int): The dimension of output of this layer.
+       weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of 
+       this layer. The default value is None.
+       bias_attr (ParamAttr, optional): The parameter attribute for the bias
+           of this layer. If it is set to False, no bias will be added to the output units.
+           If it is set to None, the bias is initialized zero. The default value is None.       
+       name (str, optional): The default value is None. Normally there is no need for user
+           to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Attribute:
+        **weight** (Parameter): the learnable weights of this layer.
+
+        **bias** (Parameter): the learnable bias of this layer.
+
+    Returns:
+       Tensor: A 2-D Tensor of shape [batch_size, out_features].
+
+    Examples:
+       .. code-block:: python
+
+        import paddle
+        import numpy
+
+        paddle.disable_static()
+        layer1 = numpy.random.random((5, 5)).astype('float32')
+        layer2 = numpy.random.random((5, 4)).astype('float32')
+        bilinear = paddle.nn.Bilinear(
+            in1_features=5, in2_features=4, out_features=1000)
+        result = bilinear(paddle.to_tensor(layer1),
+                        paddle.to_tensor(layer2))     # result shape [5, 1000]
+
+    """
+
+    def __init__(self,
+                 in1_features,
+                 in2_features,
+                 out_features,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(Bilinear, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._name = name
+        self._in1_features = in1_features
+        self._in2_features = in2_features
+        self._out_features = out_features
+        self._dtype = self._helper.get_default_dtype()
+
+        weight_shape = [
+            self._out_features, self._in1_features, self._in2_features
+        ]
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=weight_shape,
+            dtype=self._dtype,
+            is_bias=False)
+        bias_shape = [1, self._out_features]
+        self.bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=bias_shape,
+            dtype=self._dtype,
+            is_bias=True)
+
+    def forward(self, x1, x2):
+        return F.bilinear(x1, x2, self.weight, self.bias, self._name)
+
+
+class Dropout(layers.Layer):
+    """
+    Dropout is a regularization technique for reducing overfitting by preventing
+    neuron co-adaption during training as described in the paper:
+    `Improving neural networks by preventing co-adaptation of feature detectors <https://arxiv.org/abs/1207.0580>`_ 
+    The dropout operator randomly sets the outputs of some units to zero, while upscale others
+    according to the given dropout probability.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        axis (int | list): The axis along which the dropout is performed. Default None.
+        mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+                               1. upscale_in_train(default), upscale the output at training time
+
+                                  - train: out = input * mask / ( 1.0 - p )
+                                  - inference: out = input
+
+                               2. downscale_in_infer, downscale the output at inference
+
+                                  - train: out = input * mask
+                                  - inference: out = input * (1.0 - p)
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: N-D tensor.
+        - output: N-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[1,2,3], [4,5,6]]).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None):
+        super(Dropout, self).__init__()
+
+        self.p = p
+        self.axis = axis
+        self.mode = mode
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout(
+            input,
+            p=self.p,
+            axis=self.axis,
+            training=self.training,
+            mode=self.mode,
+            name=self.name)
+        return out
+
+
+class Dropout2D(layers.Layer):
+    """
+    Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` ,
+    a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+    Dropout2d will help promote independence between feature maps as described in the paper: 
+    `Efficient Object Localization Using Convolutional Networks <https://arxiv.org/abs/1411.4280>`_ 
+
+    See ``paddle.nn.functional.dropout2d`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float, optional): Probability of setting units to zero. Default: 0.5
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCHW`, `NHWC`. The default is `NCHW`. When it is `NCHW`, the data is
+                                     stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: 4-D tensor.
+        - output: 4-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5)).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout2D(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, data_format='NCHW', name=None):
+        super(Dropout2D, self).__init__()
+
+        self.p = p
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout2d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name)
+        return out
+
+
+class Dropout3D(layers.Layer):
+    """
+    Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` ,
+    a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+    Dropout3d will help promote independence between feature maps as described in the paper: 
+    `Efficient Object Localization Using Convolutional Networks <https://arxiv.org/abs/1411.4280>`_ 
+
+    See ``paddle.nn.functional.dropout3d`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCDHW`, `NDHWC`. The default is `NCDHW`. When it is `NCDHW`, the data is
+                                     stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: 5-D tensor.
+        - output: 5-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout3D(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, data_format='NCDHW', name=None):
+        super(Dropout3D, self).__init__()
+
+        self.p = p
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout3d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name)
+        return out
+
+
+class AlphaDropout(layers.Layer):
+    """
+    Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with
+    zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and
+    standard deviation of the input. Alpha Dropout fits well to SELU activate function by randomly setting
+    activations to the negative saturation value.
+
+    For more information, please refer to:
+    `Self-Normalizing Neural Networks <https://arxiv.org/abs/1706.02515>`_
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: N-D tensor.
+        - output: N-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[-1, 1], [-1, 1]]).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.AlphaDropout(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly)
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, name=None):
+        super(AlphaDropout, self).__init__()
+        self.p = p
+        self.name = name
+
+    def forward(self, input):
+        out = F.alpha_dropout(
+            input, p=self.p, training=self.training, name=self.name)
+        return out
+
+
+class ReflectionPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReflectionPad1d`` class.
+    Uses reflection of the input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            Out = [[[2. 1. 2. 3. 2. 1.]
+                    [5. 4. 5. 6. 5. 4.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReflectionPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[2. 1. 2. 3. 2. 1.]
+            #   [5. 4. 5. 6. 5. 4.]]]
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super(ReflectionPad1d, self).__init__()
+        self._mode = "reflect"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad1d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            Out = [[[2. 1. 2. 3. 2. 1.]
+                    [5. 4. 5. 6. 5. 4.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[1. 1. 2. 3. 3. 3.]
+            #   [1. 4. 5. 6. 6. 6.]]]
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super(ReplicationPad1d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad1d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            value = 0.0
+            Out = [[[0. 1. 2. 3. 0. 0.]
+                    [0. 4. 5. 6. 0. 0.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[0. 1. 2. 3. 0. 0.]
+            #   [0. 4. 5. 6. 0. 0.]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCL", name=None):
+        super(ConstantPad1d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad2d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            value = 0.0
+            Out = [[[[0. 1. 2. 3. 0.]
+                     [0. 4. 5. 6. 0.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[0. 0. 0. 0.]
+            #    [0. 1. 2. 3.]
+            #    [0. 4. 5. 6.]
+            #    [0. 0. 0. 0.]
+            #    [0. 0. 0. 0.]]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCHW", name=None):
+        super(ConstantPad2d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ZeroPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad2d`` class.
+    Uses 0 to pad the input tensor.
+
+    Parameters:
+        padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[0. 1. 2. 3. 0.]
+                     [0. 4. 5. 6. 0.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ZeroPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[0. 0. 0. 0.]
+            #    [0. 1. 2. 3.]
+            #    [0. 4. 5. 6.]
+            #    [0. 0. 0. 0.]
+            #    [0. 0. 0. 0.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ZeroPad2d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad2d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[1. 1. 2. 3. 3.]
+                     [4. 4. 5. 6. 6.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[1. 1. 2. 3.]
+            #    [1. 1. 2. 3.]
+            #    [4. 4. 5. 6.]
+            #    [4. 4. 5. 6.]
+            #    [4. 4. 5. 6.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ReplicationPad2d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReflectionPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReflectionPad2d`` class.
+    Uses reflection of the input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[2. 1. 2. 3. 2.]
+                     [5. 4. 5. 6. 5.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 4, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReflectionPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[ 5.  4.  5.  6.]
+            #    [ 2.  1.  2.  3.]
+            #    [ 5.  4.  5.  6.]
+            #    [ 8.  7.  8.  9.]
+            #    [11. 10. 11. 12.]
+            #    [ 8.  7.  8.  9.]
+            #    [ 5.  4.  5.  6.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ReflectionPad2d, self).__init__()
+        self._mode = "reflect"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad3d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad3d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+           Default is  "NCDHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+            padding = [1, 2, 0, 0, 0, 0]
+            value = 0.0
+            Out = [[[[[0. 1. 2. 3. 0. 0.]
+                      [0. 4. 5. 6. 0. 0.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 1, 2, 3)
+            pad = [1, 0, 1, 2, 0, 0]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad3d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[[0. 0. 0. 0.]
+            #     [0. 1. 2. 3.]
+            #     [0. 4. 5. 6.]
+            #     [0. 0. 0. 0.]
+            #     [0. 0. 0. 0.]]]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCDHW", name=None):
+        super(ConstantPad3d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad3d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad3d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+           Default is  "NCDHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+            padding = [1, 2, 0, 0, 0, 0]
+            Out = [[[[[1. 1. 2. 3. 3. 3.]
+                      [4. 4. 5. 6. 6. 6.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 1, 2, 3)
+            pad = [1, 0, 1, 2, 0, 0]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad3d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[[1. 1. 2. 3.]
+            #     [1. 1. 2. 3.]
+            #     [4. 4. 5. 6.]
+            #     [4. 4. 5. 6.]
+            #     [4. 4. 5. 6.]]]]]
+    """
+
+    def __init__(self, padding, data_format="NCDHW", name=None):
+        super(ReplicationPad3d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class CosineSimilarity(layers.Layer):
+    """
+    This interface is used to compute cosine similarity between x1 and x2 along axis.
+
+    Parameters:
+        axis (int): Dimension of vectors to compute cosine similarity. Default is 1.
+        eps(float): Small value to avoid division by zero. Default is 1e-8.
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            Case 0:
+                x1 = [[0.8024077  0.9927354  0.27238318 0.8344984 ]
+                     [0.48949873 0.5797396  0.65444374 0.66510963]
+                     [0.1031398  0.9614342  0.08365563 0.6796464 ]
+                     [0.10760343 0.7461209  0.7726148  0.5801006 ]]
+                x2 = [[0.62913156 0.1536727  0.9847992  0.04591406]
+                     [0.9098952  0.15715368 0.8671125  0.3156102 ]
+                     [0.4427798  0.54136837 0.5276275  0.32394758]
+                     [0.3769419  0.8535014  0.48041078 0.9256797 ]]
+                axis = 1
+                eps = 1e-8
+                Out: [0.5275037  0.8368967  0.75037485 0.9245899]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            np.random.seed(0)
+            x1 = np.random.rand(2,3)
+            x2 = np.random.rand(2,3)
+            x1 = paddle.to_tensor(x1)
+            x2 = paddle.to_tensor(x2)
+
+            cos_sim_func = nn.CosineSimilarity(axis=0)
+            result = cos_sim_func(x1, x2)
+            print(result.numpy())
+            # [0.99806249 0.9817672  0.94987036]
+    """
+
+    def __init__(self, axis=1, eps=1e-8):
+        super(CosineSimilarity, self).__init__()
+        self._axis = axis
+        self._eps = eps
+
+    def forward(self, x1, x2):
+        return F.cosine_similarity(x1, x2, axis=self._axis, eps=self._eps)
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 9fb6c9ebc2e404ab477630aae99a6b43d683b20b..7d0e59fb7575c9d15d28e88a462aed4ddba47fb9 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -15,12 +15,12 @@
 # TODO: define classes of convolutional neural network
 
 __all__ = [
-    'Conv2D',
-    'Conv2DTranspose',
-    'Conv3D',
-    'Conv3DTranspose',
-    #       'TreeConv',
-    #       'Conv1D'
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
 ]
 
 import numpy as np
@@ -38,12 +38,270 @@ def _get_default_param_initializer(num_channels, filter_size):
     return Normal(0.0, std, 0)
 
 
-class Conv2D(layers.Layer):
+def _reverse_repeat_list(t, n):
+    """Reverse the order of `t` and repeat each element for `n` times.
+    This can be used to translate padding arg used by Conv and Pooling modules
+    to the ones used by `F.pad`.
     """
-	:alias_main: paddle.nn.Conv2D
-	:alias: paddle.nn.Conv2D,paddle.nn.layer.Conv2D,paddle.nn.layer.conv.Conv2D
+    return list(x for x in reversed(t) for _ in range(n))
 
-    This interface is used to construct a callable object of the ``Conv2D`` class.
+
+class _ConvNd(layers.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 transposed,
+                 dims,
+                 stride=1,
+                 padding=0,
+                 padding_mode='zeros',
+                 output_padding=0,
+                 dilation=1,
+                 groups=1,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCHW"):
+        super(_ConvNd, self).__init__()
+        assert weight_attr is not False, "weight_attr should not be False in Conv."
+        self._param_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._groups = groups
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._data_format = data_format
+
+        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
+        if padding_mode not in valid_padding_modes:
+            raise ValueError(
+                "padding_mode must be one of {}, but got padding_mode='{}'".
+                format(valid_padding_modes, padding_mode))
+
+        if padding_mode in {'reflect', 'replicate', 'circular'
+                            } and not isinstance(padding, np.int):
+            raise TypeError(
+                "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
+            )
+
+        self._stride = utils.convert_to_list(stride, dims, 'stride')
+        self._dilation = utils.convert_to_list(dilation, dims, 'dilation')
+        self._kernel_size = utils.convert_to_list(kernel_size, dims,
+                                                  'kernel_size')
+        self._padding = padding
+        self._padding_mode = padding_mode
+        self.output_padding = output_padding
+
+        if transposed:
+            filter_shape = [self._in_channels, out_channels // groups
+                            ] + self._kernel_size
+        else:
+            if in_channels % groups != 0:
+                raise ValueError("in_channels must be divisible by groups.")
+
+            if padding_mode in {'reflect', 'replicate', 'circular'}:
+                _paired_padding = utils.convert_to_list(padding, 2, 'padding')
+                self._reversed_padding_repeated_twice = _reverse_repeat_list(
+                    _paired_padding, 2)
+
+            filter_shape = [out_channels, in_channels // groups
+                            ] + self._kernel_size
+
+        self.weight = self.create_parameter(
+            shape=filter_shape, attr=self._param_attr)
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels], is_bias=True)
+
+
+class Conv1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``Conv1d`` class.
+    For more details, refer to code examples.
+    The convolution1D layer calculates the output based on the input, filter
+    and stride, padding, dilation, groups parameters. Input and
+    Output are in NCL format or NLC format, where N is batch size, C is the number of
+    the feature map, L is the length of the feature map.
+    Filter's shape is [MCK] , where M is the number of output feature map,
+    C is the number of input feature map, K is the size of the kernel. 
+    If the groups is greater than 1, C will equal the number of input feature map divided by the groups.
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+    For each input :math:`X`, the equation is:
+    .. math::
+        Out = \\sigma (W \\ast X + b)
+    Where:
+    * :math:`X`: Input value, a ``Tensor`` with 'NCL' format or 'NLC' format.
+    * :math:`W`: Filter value, a ``Tensor`` with shape [MCK] .
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    Example:
+        - Input:
+          Input shape: :math:`(N, C_{in}, L_{in})`
+          Kernel shape: :math:`(C_{out}, C_{in}, K)`
+        - Output:
+          Output shape: :math:`(N, C_{out}, L_{out})`
+        Where
+        .. math::
+            L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1
+    Parameters:
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of filter. It is as same as the output
+            feature map.
+        kernel_size (int|tuple|list): The filter size. If kernel_size is a tuple,
+            it must contain one integer, (kernel_size).
+        stride (int|tuple|list, optional): The stride size. If stride is a tuple, it must
+            contain one integer, (stride_size). Default: 1.
+        padding(int|str|tuple|list, optional): The size of zeros to be padded. It must be in one of the following forms.
+            1. a string in ['valid', 'same'].
+            2. an int, which means the feature map is zero paded by size of `padding` on both sides.
+            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
+            The default value is 0.
+        dilation (int|tuple|list, optional): The dilation size. If dilation is a tuple, it must
+            contain one integer, (dilation_size). Default: 1.
+        groups (int, optional): The groups number of the conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: 1.
+        padding_mode(str, optional): Four modes: 'zeros', 'reflect', 'replicate', 'circular'.
+            When in 'zeros' mode, this op uses zeros to pad the input tensor.
+            When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor.
+            When in 'replicate' mode, uses input boundaries to pad the input tensor.
+            When in 'circular' mode, uses circular input to pad the input tensor.
+            Default is 'zeros'.
+        bias(bool, optional): Whether to use bias. Default: True.
+        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+            of conv1d. If it is set to None or one attribute of ParamAttr, conv1d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv1d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv1d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+    Attribute:
+        **weight** (Parameter): the learnable weights of filter of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+    Shape:
+        - x: 3-D tensor with shape: (batch, in_channels, length) or (batch, length, in_channels).
+        - output: 3-D tensor with same shape as input x.
+    
+    Raises:
+        None
+    Examples:
+        .. code-block:: python
+          import paddle
+          from paddle.nn import Conv1d
+          import numpy as np
+          x = np.array([[[4, 8, 1, 9],
+            [7, 2, 0, 9],
+            [6, 9, 2, 6]]]).astype(np.float32)
+          w=np.array(
+          [[[9, 3, 4],
+            [0, 0, 7],
+            [2, 5, 6]],
+           [[0, 3, 4],
+            [2, 9, 7],
+            [5, 6, 8]]]).astype(np.float32)
+          paddle.disable_static()
+          x_t = paddle.to_tensor(x)
+          conv = Conv1d(3, 2, 3)
+          conv.weight.set_value(w)
+          y_t = conv(x_t)
+          y_np = y_t.numpy()
+          print(y_np)
+          # [[[133. 238.]
+          #   [160. 211.]]]
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 bias=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCL",
+                 name=None):
+        super(Conv1d, self).__init__()
+        assert weight_attr is not False, "param_attr should not be False here."
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._groups = groups
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups.")
+        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
+        self._stride = utils.convert_to_list(stride, 1, 'stride')
+        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
+        self._padding = padding  # leave it to F.conv1d
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._data_format = data_format
+        self._name = name
+
+        self._padding_mode = padding_mode
+
+        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
+        if padding_mode not in valid_padding_modes:
+            raise ValueError(
+                "padding_mode must be one of {}, but got padding_mode='{}'".
+                format(valid_padding_modes, padding_mode))
+
+        if padding_mode in {'reflect', 'replicate', 'circular'
+                            } and not isinstance(padding, np.int):
+            raise ValueError(
+                "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
+            )
+        if not isinstance(padding, str):
+            self._padding = utils.convert_to_list(padding, 1, 'padding') * 2
+
+        num_filter_channels = in_channels // groups
+        filter_shape = [self._out_channels, num_filter_channels
+                        ] + self._kernel_size
+
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=filter_shape,
+            default_initializer=_get_default_param_initializer(
+                self._in_channels, filter_shape))
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels],
+            is_bias=True) if bias else None
+
+    def forward(self, x):
+        padding = 0
+        if self._padding_mode != "zeros":
+            x = F.pad(x,
+                      self._padding,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+        else:
+            padding = self._padding
+
+        out = F.conv1d(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+            name=self._name)
+        return out
+
+
+class Conv2d(_ConvNd):
+    """
+    This interface is used to construct a callable object of the ``Conv2d`` class.
     For more details, refer to code examples.
     The convolution2D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input and
@@ -59,48 +317,23 @@ class Conv2D(layers.Layer):
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \\sigma (W \\ast X + b)
-
     Where:
-
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of filter. It is as same as the output
-            feature map.
-        filter_size (int or tuple): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by convolution.
+        kernel_size (int|list|tuple): The size of convolution kernel.
+        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides 
@@ -108,10 +341,8 @@ class Conv2D(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: 1.
-        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` .
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation. Default: 1.
         groups (int, optional): The groups number of the Conv2d Layer. According to grouped
@@ -119,129 +350,287 @@ class Conv2D(layers.Layer):
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
             and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d.
+        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filter of this layer.
-
         **bias** (Parameter or None): the learnable bias of this layer.
-
-    Returns:
-        None
-    
-    Raises:
-        ValueError: if ``use_cudnn`` is not a bool value.
-
+    Shape:
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1
     Examples:
         .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv2D(4, 6, (3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.Conv2d(4, 6, (3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 6, 6)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 padding_mode='zeros',
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCHW"):
+        super(Conv2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            2,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x):
+        if self._padding_mode != 'zeros':
+            x = F.pad(x,
+                      self._reversed_padding_repeated_twice,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+            return F.conv2d(
+                x,
+                self.weight,
+                bias=self.bias,
+                stride=self._stride,
+                dilation=self._dilation,
+                groups=self._groups,
+                data_format=self._data_format)
+
+        out = F.conv2d(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=self._padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format)
+        return out
+
+
+class ConvTranspose1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConvTranspose1d`` class.
+    For more details, refer to code examples.
+    The 1-D convolution transpose layer calculates the output based on the input,
+    filter, and dilation, stride, padding. Input(Input) and output(Output)
+    are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels,
+    L is the length of the feature. The details of convolution transpose
+    layer, please refer to the following explanation and references
+    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format.
+    * :math:`W`: Kernel value, a 3-D Tensor with 'MCK' format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' of 'NLC', the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+           L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 \\\\
+           L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ]
+
+    Note:
+          The conv1d_transpose can be seen as the backward of the conv1d. For conv1d,
+          when stride > 1, conv1d maps multiple input shape to the same output shape,
+          so for conv1d_transpose, when stride > 1, input shape maps multiple output shape.
+          If output_size is None, :math:`L_{out} = L^\prime_{out}`;
+          else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}`
+          and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically.
+
+    Args:
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of the filter. It is as same as the output
+            feature map.
+        kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple,
+            it must contain one integers, (kernel_size). None if
+            use output size to calculate kernel_size. Default: None. kernel_size and
+            output_size should not be None at the same time.
+        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain one integer, (stride_size).
+            Default: stride = 1.
+        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
+             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
+             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
+             If `padding` is a tuple or list, it could be in two forms:
+             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
+        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
+             If it is a tuple, it must contain one integer. Default: 0.
+        groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        bias(bool, optional): Whether to use bias. Default: True.
+        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain one integer, (dilation_size).
+            Default: dilation = 1.
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+            of conv1d_transpose. If it is set to None or one attribute of ParamAttr, conv1d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv1d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv1d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+
+    Attribute:
+        **weight** (Parameter): the learnable weights of filters of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+
+    Shape:
+        - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is
+            "NCL" or shape (batch, length, in_channels) when data_format is "NLC".
+        - output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain one integer, (feature_length). None if use
+            kernel_size, padding, output_padding and stride to calculate output_size.
+            If output_size and kernel_size are specified at the same time, They
+            should follow the formula above. Default: None. output_size and kernel_size
+            should not be None at the same time.
+        - output(Tensor): 3-D tensor with same shape as input x.
+
+    Examples:
+       .. code-block:: python
+
+          import paddle
+          from paddle.nn import ConvTranspose1d
+          import numpy as np
+          
+          paddle.disable_static()
+          # shape: (1, 2, 4)
+          x=np.array([[[4, 0, 9, 7],
+                       [8, 0, 9, 2]]]).astype(np.float32)
+          # shape: (2, 1, 2)
+          y=np.array([[[7, 0]],
+                      [[4, 2]]]).astype(np.float32)
+          x_t = paddle.to_tensor(x)
+          conv = ConvTranspose1d(2, 1, 2)
+          conv.weight.set_value(y)
+          y_t = conv(x_t)
+          y_np = y_t.numpy()
+          print y_np
+          
+          # [[[60. 16. 99. 75.  4.]]]
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 output_padding=0,
+                 groups=1,
+                 bias=True,
+                 dilation=1,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCHW",
-                 dtype='float32'):
-        super(Conv2D, self).__init__()
-        assert param_attr is not False, "param_attr should not be False here."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
+                 data_format="NCL"):
+        super(ConvTranspose1d, self).__init__()
+        assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d."
+        self._param_attr = weight_attr
+        self._bias_attr = bias_attr
         self._groups = groups
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        self._act = act
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._output_padding = output_padding
         self._data_format = data_format
-        self._dtype = dtype
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-
-        self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-        self._stride = utils.convert_to_list(stride, 2, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
-        channel_last = (data_format == "NHWC")
-        self._padding = padding  # leave it to F.conv2d
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
+        self._bias = bias
 
-        num_filter_channels = num_channels // groups
-        filter_shape = [self._num_filters, num_filter_channels
-                        ] + self._filter_size
+        self._stride = utils.convert_to_list(stride, 1, 'stride')
+        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
+        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
+        self._padding = padding
 
+        filter_shape = [self._in_channels, out_channels // groups
+                        ] + self._kernel_size
         self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(
-                self._num_channels, filter_shape))
+            shape=filter_shape, attr=self._param_attr)
         self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+            attr=self._bias_attr, shape=[self._out_channels],
+            is_bias=True) if self._bias else None
 
-    def forward(self, input):
-        out = F.conv2d(
-            input,
+    def forward(self, x, output_size=None):
+        out = F.conv_transpose1d(
+            x,
             self.weight,
             bias=self.bias,
+            output_size=output_size,
+            output_padding=self._output_padding,
             padding=self._padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
             data_format=self._data_format)
         return out
 
 
-class Conv2DTranspose(layers.Layer):
+class ConvTranspose2d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv2DTranspose
-	:alias: paddle.nn.Conv2DTranspose,paddle.nn.layer.Conv2DTranspose,paddle.nn.layer.conv.Conv2DTranspose
-
-    This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
+    This interface is used to construct a callable object of the ``ConvTranspose2d`` class.
     For more details, refer to code examples.
     The convolution2D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input and output
@@ -256,55 +645,36 @@ class Conv2DTranspose(layers.Layer):
     is applied to the final result.
     The details of convolution transpose layer, please refer to the following explanation and references
     `conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     Where:
-
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
     Example:
-
         - Input:
-
           Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
           Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
         - Output:
-
           Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
         Where
-
         .. math::
-
            H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
            W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
            H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
            W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of the filter. It is as same as the output
-            feature map.
-        filter_size(int or tuple): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        output_size(int or tuple, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by the convolution.
+        kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple,
+            it must contain two integers, (kernel_size_H, kernel_size_W).
+            Otherwise, the kernel will be a square.
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 
@@ -312,10 +682,10 @@ class Conv2DTranspose(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride(int or tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
             contain two integers, (stride_H, stride_W). Otherwise, the
             stride_H = stride_W = stride. Default: 1.
-        dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation. Default: 1.
         groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
@@ -324,125 +694,94 @@ class Conv2DTranspose(layers.Layer):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose.
+        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter or None): the learnable bias of this layer.
-
-    Returns:
-        None
-
+    Shape:
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
+           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
     Examples:
        .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv2DTranspose(4, 6, (3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.ConvTranspose2d(4, 6, (3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 10, 10)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 output_size=None,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
+                 output_padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCHW",
-                 dtype='float32'):
-        super(Conv2DTranspose, self).__init__()
-        assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self._groups = groups
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._data_format = data_format
-        self._dtype = dtype
-
-        self._stride = utils.convert_to_list(stride, 2, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
+                 data_format="NCHW"):
+        super(ConvTranspose2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            2,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x, output_size=None):
         if output_size is None:
-            self._output_size = output_size
-        elif isinstance(output_size, (list, tuple, int)):
-            self._output_size = utils.convert_to_list(output_size, 2,
-                                                      'output_size')
+            output_padding = self.output_padding
         else:
-            raise ValueError(
-                "output_size should be int, ot list[int] or tuple[int]")
-        self._padding = padding
+            output_padding = 0
 
-        filter_shape = [self._num_channels, num_filters // groups
-                        ] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        out = F.conv2d_transpose(
-            input,
+        out = F.conv_transpose2d(
+            x,
             self.weight,
             bias=self.bias,
-            output_size=self._output_size,
             padding=self._padding,
+            output_padding=output_padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
+            output_size=output_size,
             data_format=self._data_format)
         return out
 
 
-class Conv3D(layers.Layer):
+class Conv3d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv3D
-	:alias: paddle.nn.Conv3D,paddle.nn.layer.Conv3D,paddle.nn.layer.conv.Conv3D
-
-    **Convlution3D Layer**
-
-    The convolution3D layer calculates the output based on the input, filter
+    **Convlution3d Layer**
+    The convolution3d layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
     Output(Output) are multidimensional tensors with a shape of 
     :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
@@ -451,49 +790,21 @@ class Conv3D(layers.Layer):
     but adds one dimension(depth). If bias attribution and activation type are
     provided, bias is added to the output of the convolution, and the
     corresponding activation function is applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     In the above equation:
-
     * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
-
-        - Output:
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of filter. It is as same as the output image channel.
-        filter_size (int|tuple, optional): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square, filter_size_depth = filter_size_height
-            = filter_size_width = filter_size.
-        stride (int|tuple, optional): The stride size. If stride is a tuple, it must
+        in_channels(int): The number of input channels in the input image.
+        out_channels(int): The number of output channels produced by the convolution.
+        kernel_size (int|list|tuple, optional): The size of the convolving kernel.
+        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
             contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
             stride_D = stride_H = stride_W = stride. The default value is 1.
         padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -503,7 +814,7 @@ class Conv3D(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
         groups (int, optional): The groups number of the Conv3d Layer. According to grouped
@@ -511,7 +822,8 @@ class Conv3D(layers.Layer):
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as param_attr. If it is set to None, the parameter
             is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
@@ -521,120 +833,97 @@ class Conv3D(layers.Layer):
             If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. The default value is None.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
+    Shape:
+        - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
-
     Examples:
         .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv3D(4, 6, (3, 3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = dg.to_variable(x)
+          conv = nn.Conv3d(4, 6, (3, 3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 6, 6, 6)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  padding=0,
                  stride=1,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 padding_mode='zeros',
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCDHW",
-                 dtype='float32'):
-        super(Conv3D, self).__init__()
-        assert param_attr is not False, "param_attr should not be False here."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._groups = groups
-        self._act = act
-        self._use_cudnn = use_cudnn
-        self._dtype = dtype
-        self._data_format = data_format
-
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
-        channel_last = (data_format == "NDHWC")
-        self._padding = padding
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels // groups
-
-        filter_shape = [num_filters, num_filter_channels] + self._filter_size
-
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(
-                self._num_channels, self._filter_size))
-
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+                 data_format="NCDHW"):
+        super(Conv3d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            3,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x):
+        if self._padding_mode != 'zeros':
+            x = F.pad(x,
+                      self._reversed_padding_repeated_twice,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+            return F.conv3d(
+                x,
+                self.weight,
+                bias=self.bias,
+                stride=self._stride,
+                dilation=self._dilation,
+                groups=self._groups,
+                data_format=self._data_format)
 
-    def forward(self, input):
         out = F.conv3d(
-            input,
+            x,
             self.weight,
             bias=self.bias,
             padding=self._padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
             data_format=self._data_format)
         return out
 
 
-class Conv3DTranspose(layers.Layer):
+class ConvTranspose3d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv3DTranspose
-	:alias: paddle.nn.Conv3DTranspose,paddle.nn.layer.Conv3DTranspose,paddle.nn.layer.conv.Conv3DTranspose
-
     **Convlution3D transpose layer**
-
     The convolution3D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
     are in NCDHW format. Where N is batch size, C is the number of channels,
@@ -646,70 +935,38 @@ class Conv3DTranspose(layers.Layer):
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     In the above equation:
-
     * :math:`X`: Input value, a tensor with NCDHW format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
     Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
-           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
-           D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
-
     **Note**:
-
-          The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, 
+          The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, 
           when stride > 1, conv3d maps multiple input shape to the same output shape, 
-          so for conv3d_transpose, when stride > 1, input shape maps multiple output shape.
+          so for conv_transpose3d, when stride > 1, input shape maps multiple output shape.
           If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \
           H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output 
           size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, 
           the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` 
           and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must 
           between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, 
-          conv3d_transpose can compute the kernel size automatically.
-
-
+          conv_transpose3d can compute the kernel size automatically.
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        filter_size(int|tuple): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        output_size(int or tuple, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by the convolution.
+        kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple,
+            it must contain three integers, (kernel_size_D, kernel_size_H, kernel_size_W).
+            Otherwise, the kernel will be a square.
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
+            The default value is 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
@@ -717,11 +974,9 @@ class Conv3DTranspose(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
-            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
-            The default value is 1.
-        dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
         groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
@@ -730,7 +985,7 @@ class Conv3DTranspose(layers.Layer):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. The default value is None.
@@ -739,109 +994,86 @@ class Conv3DTranspose(layers.Layer):
             If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. The default value is None.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
+        output_size(int|list|tuple, optional): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). None if use
+            filter_size, padding, and stride to calculate output_size.
+            if output_size and filter_size are specified at the same time, They
+            should follow the formula above. Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
+    Shape:
+        - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
+           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
+           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
-
     Examples:
        .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 10, 10, 10)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 output_size=None,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
+                 output_padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCDHW",
-                 dtype='float32'):
-        super(Conv3DTranspose, self).__init__()
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._groups = groups
-        self._use_cudnn = use_cudnn
-        self._act = act
-        self._dtype = dtype
-        self._data_format = data_format
-
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
-        channel_last = (data_format == "NDHWC")
-        self._padding = padding
+                 data_format="NCDHW"):
+        super(ConvTranspose3d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            3,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x, output_size):
         if output_size is None:
-            self._output_size = output_size
-        elif isinstance(output_size, (list, tuple, int)):
-            self._output_size = utils.convert_to_list(output_size, 3,
-                                                      'output_size')
+            output_padding = self.output_padding
         else:
-            raise ValueError(
-                "output_size should be int, ot list[int] or tuple[int]")
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
+            output_padding = 0
 
-        filter_shape = [num_channels, num_filters // groups] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        out = F.conv3d_transpose(
-            input,
+        out = F.conv_transpose3d(
+            x,
             self.weight,
             bias=self.bias,
-            output_size=self._output_size,
             padding=self._padding,
+            output_padding=output_padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
+            output_size=output_size,
             data_format=self._data_format)
         return out
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0917441de3fea640204a3891ed03e9a451e3f0f
--- /dev/null
+++ b/python/paddle/nn/layer/distance.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['PairwiseDistance']
+
+import numpy as np
+
+import paddle
+from ...fluid.dygraph import layers
+from ...fluid.framework import core, in_dygraph_mode
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid.layer_helper import LayerHelper
+
+
+class PairwiseDistance(layers.Layer):
+    """
+    This operator computes the pairwise distance between two vectors. The
+    distance is calculated by p-oreder norm:
+
+    .. math::
+
+        \Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}.
+
+    Parameters:
+        p (float): The order of norm. The default value is 2.
+        epsilon (float, optional): Add small value to avoid division by zero,
+            default value is 1e-6.
+        keepdim (bool, optional): Whether to reserve the reduced dimension
+            in the output Tensor. The result tensor is one dimension less than
+            the result of ``'x-y'`` unless :attr:`keepdim` is True, default
+            value is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        x: :math:`(N, D)` where `D` is the dimension of vector, available dtype
+            is float32, float64.
+        y: :math:`(N, D)`, y have the same shape and dtype as x.
+        out: :math:`(N)`. If :attr:`keepdim` is ``True``, the out shape is :math:`(N, 1)`.
+            The same dtype as input tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([[1., 3.], [3., 5.]]).astype(np.float64)
+            y_np = np.array([[5., 6.], [7., 8.]]).astype(np.float64)
+            x = paddle.to_variable(x_np)
+            y = paddle.to_variable(y_np)
+            dist = paddle.nn.PairwiseDistance()
+            distance = dist(x, y)
+            print(distance.numpy()) # [5. 5.]
+
+    """
+
+    def __init__(self, p=2., epsilon=1e-6, keepdim=False, name=None):
+        super(PairwiseDistance, self).__init__()
+        self.p = p
+        self.epsilon = epsilon
+        self.keepdim = keepdim
+        self.name = name
+        check_type(self.p, 'porder', (float, int), 'PairwiseDistance')
+        check_type(self.epsilon, 'epsilon', (float), 'PairwiseDistance')
+        check_type(self.keepdim, 'keepdim', (bool), 'PairwiseDistance')
+
+    def forward(self, x, y):
+        if in_dygraph_mode():
+            sub = core.ops.elementwise_sub(x, y)
+            return core.ops.p_norm(sub, 'axis', 1, 'porder', self.p, 'keepdim',
+                                   self.keepdim, 'epsilon', self.epsilon)
+
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'PairwiseDistance')
+        check_variable_and_dtype(y, 'y', ['float32', 'float64'],
+                                 'PairwiseDistance')
+        sub = paddle.elementwise_sub(x, y)
+
+        helper = LayerHelper("PairwiseDistance", name=self.name)
+        attrs = {
+            'axis': 1,
+            'porder': self.p,
+            'keepdim': self.keepdim,
+            'epsilon': self.epsilon,
+        }
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        helper.append_op(
+            type='p_norm', inputs={'X': sub}, outputs={'Out': out}, attrs=attrs)
+
+        return out
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index d858d1352620399fdae477f6a6ca2db620abe2e2..de10e77eb1c000e66a7a914dc94ce39a6268bb61 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -12,20 +12,133 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define loss functions of neural network  
+# TODO: define loss functions of neural network
+import numpy as np
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
+from .. import functional as F
+from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator
 
 __all__ = [
-    #       'NCELoss',
+    'BCEWithLogitsLoss',
     'CrossEntropyLoss',
     'MSELoss',
     'L1Loss',
     'NLLLoss',
-    'BCELoss'
+    'BCELoss',
+    'KLDivLoss',
+    'MarginRankingLoss',
+    'CTCLoss',
+    'SmoothL1Loss',
 ]
 
 
+class BCEWithLogitsLoss(fluid.dygraph.Layer):
+    """
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    This can be thought of as predicting labels for a data-point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+
+    First this operator calculate loss function as follows:
+
+    .. math::
+           Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit))
+
+    We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get:
+
+    .. math::
+           Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit})
+
+    For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+
+    .. math::
+           Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|})
+
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
+    weight tensor on the loss `Out`. The ``weight`` tensor will attach different
+    weight on every items in the batch. The ``pos_weight`` will attach different
+    weight on the positive label of each class.
+
+    Finally, this operator applies reduce operation on the loss.
+    If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`.
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+
+    Args:
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            The data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        pos_weight (Tensor, optional): A weight of positive examples. Must be a vector
+            with length equal to the number of classes. The data type is float32, float64.
+            Default is ``'None'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shapes:
+        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``logit`` , else the shape of output is scalar.
+
+    Returns:
+        A callable object of BCEWithLogitsLoss.
+
+    Examples:
+
+        .. code-block:: python
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
+            output = bce_logit_loss(logit, label)
+            print(output.numpy())  # [0.45618808]
+
+    """
+
+    def __init__(self,
+                 weight=None,
+                 reduction='mean',
+                 pos_weight=None,
+                 name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % reduction)
+
+        super(BCEWithLogitsLoss, self).__init__()
+        self.weight = weight
+        self.reduction = reduction
+        self.pos_weight = pos_weight
+        self.name = name
+
+    def forward(self, logit, label):
+        out = paddle.nn.functional.binary_cross_entropy_with_logits(
+            logit, label, self.weight, self.reduction, self.pos_weight,
+            self.name)
+        return out
+
+
 class CrossEntropyLoss(fluid.dygraph.Layer):
     """
 	:alias_main: paddle.nn.CrossEntropyLoss
@@ -55,8 +168,8 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
     Parameters:
         input (Variable): Input tensor, the data type is float32, float64. Shape is
 	    (N, C), where C is number of classes, and if shape is more than 2D, this
-	    is (N, C, D1, D2,..., Dk), k >= 1. 
-        label (Variable): Label tensor, the data type is int64. Shape is (N), where each 
+	    is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Variable): Label tensor, the data type is int64. Shape is (N), where each
 	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
 	    (N, D1, D2,..., Dk), k >= 1.
         weight (Variable, optional): Weight tensor, a manual rescaling weight given
@@ -112,7 +225,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
                 print(output.numpy())
     """
 
-    def __init__(self, weight=None, reduction='mean', ignore_index=-100):
+    def __init__(self, weight=None, ignore_index=-100, reduction='mean'):
         super(CrossEntropyLoss, self).__init__()
         self.weight = weight
         self.reduction = reduction
@@ -130,25 +243,16 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
                 " 'none', but received %s, which is not allowed." %
                 self.reduction)
 
-        log_softmax = paddle.nn.LogSoftmax()
-        log_softmax_out = log_softmax(input)
-        if self.weight is not None and not isinstance(self.weight,
-                                                      fluid.framework.Variable):
-            raise ValueError(
-                "The weight' is not a Variable, please convert to Variable.")
-        nll_loss = paddle.nn.loss.NLLLoss(
+        return paddle.nn.functional.cross_entropy(
+            input,
+            label,
             weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
-
-        return nll_loss(log_softmax_out, label)
+            ignore_index=self.ignore_index,
+            reduction=self.reduction)
 
 
 class MSELoss(fluid.dygraph.layers.Layer):
     """
-	:alias_main: paddle.nn.MSELoss
-	:alias: paddle.nn.MSELoss,paddle.nn.layer.MSELoss,paddle.nn.layer.loss.MSELoss
-
     **Mean Square Error Loss**
     Computes the mean square error (squared L2 norm) of given input and label.
 
@@ -170,55 +274,34 @@ class MSELoss(fluid.dygraph.layers.Layer):
     where `input` and `label` are `float32` tensors of same shape.
 
     Parameters:
-        input (Variable): Input tensor, the data type is float32,
-        label (Variable): Label tensor, the data type is float32,
         reduction (string, optional): The reduction method for the output,
             could be 'none' | 'mean' | 'sum'.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. 
-            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. 
-            If :attr:`reduction` is ``'none'``, the unreduced loss is returned. 
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
 
-    Returns:
-        The tensor variable storing the MSE loss of input and label.
-
-    Return type:
-        Variable.
+    Shape:
+        input (Tensor): Input tensor, the data type is float32 or float64
+        label (Tensor): Label tensor, the data type is float32 or float64
+        output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
             import paddle
-            from paddle import fluid
-            import paddle.fluid.dygraph as dg
 
-            mse_loss = paddle.nn.loss.MSELoss()
-            input = fluid.data(name="input", shape=[1])
-            label = fluid.data(name="label", shape=[1])
-            place = fluid.CPUPlace()
             input_data = np.array([1.5]).astype("float32")
             label_data = np.array([1.7]).astype("float32")
 
-            # declarative mode
-            output = mse_loss(input,label)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            output_data = exe.run(
-                fluid.default_main_program(),
-                feed={"input":input_data, "label":label_data},
-                fetch_list=[output],
-                return_numpy=True)
-            print(output_data)
-            # [array([0.04000002], dtype=float32)]
-
-            # imperative mode
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                output = mse_loss(input, label)
-                print(output.numpy())
-                # [0.04000002]
+            paddle.disable_static()
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = mse_loss(input, label)
+            print(output.numpy())
+            # [0.04000002]
     """
 
     def __init__(self, reduction='mean'):
@@ -231,10 +314,10 @@ class MSELoss(fluid.dygraph.layers.Layer):
 
     def forward(self, input, label):
         if not fluid.framework.in_dygraph_mode():
-            fluid.data_feeder.check_variable_and_dtype(input, 'input',
-                                                       ['float32'], 'MSELoss')
-            fluid.data_feeder.check_variable_and_dtype(label, 'label',
-                                                       ['float32'], 'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(
+                input, 'input', ['float32', 'float64'], 'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(
+                label, 'label', ['float32', 'float64'], 'MSELoss')
 
         square_out = fluid.layers.square(
             fluid.layers.elementwise_sub(input, label))
@@ -250,180 +333,159 @@ class MSELoss(fluid.dygraph.layers.Layer):
 
 class L1Loss(fluid.dygraph.Layer):
     """
-	:alias_main: paddle.nn.L1Loss
-	:alias: paddle.nn.L1Loss,paddle.nn.layer.L1Loss,paddle.nn.layer.loss.L1Loss
-
     This interface is used to construct a callable object of the ``L1Loss`` class.
-    The L1Loss layer calculates the L1 Loss of input predictions and target 
-    labels as follows.
+    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
+
+     If `reduction` set to ``'none'``, the loss is:
 
-    If :attr:`reduction` set to ``'none'``, the unreduced loss is:
     .. math::
-        Out = |input - label|
-    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+        Out = \lvert input - label\rvert
+
+    If `reduction` set to ``'mean'``, the loss is:
+
     .. math::
-        Out = MEAN(|input - label|)
-    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+        Out = MEAN(\lvert input - label\rvert)
+
+    If `reduction` set to ``'sum'``, the loss is:
+
     .. math::
-        Out = SUM(|input - label|)
+        Out = SUM(\lvert input - label\rvert)
+
 
-    The shape of input predictions and target labels are [N, *], where N is batch_size and `*` 
-    means any number of additional dimensions.
-    If :attr:`reduction` is ``'none'``, the shape of output loss is [N, *], the same as input.
-    If :attr:`reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1], which means the output is a scalar.
-    
     Parameters:
-        reduction (str, optional): Indicate the reduction to apply to the loss, 
+        reduction (str, optional): Indicate the reduction to apply to the loss,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'none'``, the unreduced loss is returned; 
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. 
-            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. 
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
             Default is ``'mean'``.
-    Returns:
-        A callable object of L1Loss.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
+        output (Tensor): The L1 Loss of ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+
     Examples:
         .. code-block:: python
-            # declarative mode
-            import paddle.fluid as fluid
-            import numpy as np
             import paddle
-            input = fluid.data(name="input", shape=[1])
-            label = fluid.data(name="label", shape=[1])
-            l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
-            output = l1_loss(input,label)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-    
-            input_data = np.array([1.5]).astype("float32")
-            label_data = np.array([1.7]).astype("float32")
-            output_data = exe.run(fluid.default_main_program(),
-                    feed={"input":input_data, "label":label_data},
-                    fetch_list=[output],
-                    return_numpy=True)
-    
-            print(output_data)  # [array([0.2], dtype=float32)]
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
-                output = l1_loss(input,label)
-                print(output.numpy())  # [0.2]
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+
+            l1_loss = paddle.nn.loss.L1Loss()
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [0.35]
+
+            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [1.4]
+
+            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
     """
 
-    def __init__(self, reduction='mean'):
+    def __init__(self, reduction='mean', name=None):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
                 "received %s, which is not allowed." % reduction)
         super(L1Loss, self).__init__()
         self.reduction = reduction
+        self.name = name
 
     def forward(self, input, label):
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
-        fluid.data_feeder.check_variable_and_dtype(
-            label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
-
-        unreduced = fluid.layers.elementwise_sub(input, label, act='abs')
-
-        if self.reduction == 'sum':
-            return fluid.layers.reduce_sum(unreduced)
-        elif self.reduction == 'mean':
-            return fluid.layers.reduce_mean(unreduced)
-        else:
-            return unreduced
+        return paddle.nn.functional.l1_loss(
+            input, label, self.reduction, name=self.name)
 
 
 class BCELoss(fluid.dygraph.Layer):
     """
-	:alias_main: paddle.nn.BCELoss
-	:alias: paddle.nn.BCELoss,paddle.nn.layer.BCELoss,paddle.nn.layer.loss.BCELoss
-
     This interface is used to construct a callable object of the ``BCELoss`` class.
-    The BCELoss layer measures the binary_cross_entropy loss between input predictions 
-    and target labels. The binary_cross_entropy loss can be described as:
+    The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
+    and target labels ``label`` . The binary_cross_entropy loss can be described as:
 
     If :attr:`weight` is set, the loss is:
 
     .. math::
         Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
+
     If :attr:`weight` is None, the loss is:
 
     .. math::
         Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
 
-    If :attr:`reduction` set to ``'none'``, the unreduced loss is:
+    If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`.
 
-    .. math::
-        Out = Out
     If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
 
     .. math::
         Out = MEAN(Out)
+
     If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
 
     .. math::
         Out = SUM(Out)
 
-    Note that the input predictions always be the output of sigmoid, and the target labels 
+    Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label``
     should be numbers between 0 and 1.
 
-    The shape of input predictions and target labels are [N, *], where N is batch_size and `*` 
-    means any number of additional dimensions. If ``reduction`` is ``'none'``, the shape of 
-    output is scalar, else the shape of output is same as input.
-
     Parameters:
-        weight (Variable, optional): A manual rescaling weight given to the loss of each 
-            batch element. If given, has to be a Variable of size nbatch and the data type
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, has to be a Tensor of size nbatch and the data type
             is float32, float64. Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss by batch_size, 
+        reduction (str, optional): Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; 
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
             If :attr:`reduction` is ``'sum'``, the summed loss is returned.
             Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): 2-D tensor with shape: (N, *), N is batch_size, `*` means
+            number of additional dimensions. The input ``input`` should always
+            be the output of sigmod.  Available dtype is float32, float64.
+        label (Tensor): 2-D tensor with the same shape as ``input``. The target
+            labels which values should be numbers between 0 and 1. Available
+            dtype is float32, float64.
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``input`` , else the shape of output is scalar.
 
-    Returns: 
+    Returns:
         A callable object of BCELoss.
 
     Examples:
         .. code-block:: python
 
-            # declarative mode
-            import paddle.fluid as fluid
             import numpy as np
             import paddle
-            input = fluid.data(name="input", shape=[3, 1], dtype='float32')
-            label = fluid.data(name="label", shape=[3, 1], dtype='float32')
-            bce_loss = paddle.nn.loss.BCELoss()
-            output = bce_loss(input, label)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-    
             input_data = np.array([0.5, 0.6, 0.7]).astype("float32")
             label_data = np.array([1.0, 0.0, 1.0]).astype("float32")
-            output_data = exe.run(fluid.default_main_program(),
-                    feed={"input":input_data, "label":label_data},
-                    fetch_list=[output],
-                    return_numpy=True)
-    
-            print(output_data)  # [array([0.65537095], dtype=float32)]
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                output = bce_loss(input, label)
-                print(output.numpy())  # [0.65537095]
+
+            paddle.disable_static()
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+            bce_loss = paddle.nn.loss.BCELoss()
+            output = bce_loss(input, label)
+            print(output.numpy())  # [0.65537095]
+            paddle.enable_static()
+
     """
 
-    def __init__(self, weight=None, reduction='mean'):
+    def __init__(self, weight=None, reduction='mean', name=None):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but "
@@ -432,38 +494,12 @@ class BCELoss(fluid.dygraph.Layer):
         super(BCELoss, self).__init__()
         self.weight = weight
         self.reduction = reduction
+        self.name = name
 
     def forward(self, input, label):
-        dtype = self._helper.input_dtype(input)
-
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64'], 'bce_loss')
-        fluid.data_feeder.check_variable_and_dtype(
-            label, 'label', ['float32', 'float64'], 'bce_loss')
-
-        out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
-        self._helper.append_op(
-            type='bce_loss',
-            inputs={
-                'X': [input],
-                'Label': [label],
-            },
-            outputs={'Out': [out]})
-
-        if self.weight is not None:
-            if isinstance(self.weight, fluid.framework.Variable):
-                w = self.weight
-                out = fluid.layers.elementwise_mul(out, w, axis=-1)
-            else:
-                raise ValueError(
-                    "The weight is not a Variable, please convert to Variable.")
-
-        if self.reduction == 'sum':
-            return fluid.layers.reduce_sum(out)
-        elif self.reduction == 'mean':
-            return fluid.layers.reduce_mean(out)
-        else:
-            return out
+        out = paddle.nn.functional.binary_cross_entropy(
+            input, label, self.weight, self.reduction, self.name)
+        return out
 
 
 class NLLLoss(fluid.dygraph.Layer):
@@ -471,20 +507,20 @@ class NLLLoss(fluid.dygraph.Layer):
 	:alias_main: paddle.nn.NLLLoss
 	:alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss
 
-    This op accepts input and target label and returns negative log likelihood 
+    This class accepts input and target label and returns negative log likelihood
     cross error. It is useful to train a classification problem with C classes.
-     
+
     The input for the loss is epected to contain log-probabilities of
-    each classes. It hs to be a Tensor of size either (batch_size, C) or 
+    each classes. It has to be a Tensor of size either (batch_size, C) or
     (batch_size, C, d1, d2, ..., dK) with K >= 1 for the K-dimensional case.
     The label for the loss should be a class index in the range [0, C-1]
     where C is the number of classes. If ignore_index is specified, the
     specified target value does not contribute to the input gradient.
-    
+
     If the optional argument `weight` is provided, it should be a 1D Tensor
     assigning weight to each of the classed. This is particularly useful
     when you have an unbalanced training set.
- 
+
     The loss is calculated as follows.
     The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
 
@@ -505,106 +541,379 @@ class NLLLoss(fluid.dygraph.Layer):
         \\end{cases}
 
     Parameters:
-        input (Variable): Input tensor, the data type is float32, float64. 
-        label (Variable): Label tensor, the data type is int64_t.
-        weight (Variable, optional): Weight tensor, a manual rescaling weight given
-            to each class. If given, it has to be a Tensor of size `C`. Otherwise,
-            it treated as if having all ones. the data type is 
+        weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+            to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise,
+            it treated as if having all ones. the data type is
             float32, float64, Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss, 
-            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; 
-            Default is ``'mean'``.
         ignore_index (int64, optional): Specifies a target value that is ignored
             and does not contribute to the input gradient.
+        reduction (str, optional): Indicate how to average the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'mean'``, the reduced mean loss is returned;
+            if `reduction` is ``'sum'``, the reduced sum loss is returned;
+            if `reduction` is ``'none'``, no reduction will be apllied.
+            Default is ``'mean'``.
+         name (str, optional): Name for the operation (optional, default is None).
+             For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
+            But in K-dimension situation, the shape is :math:`[N, C, d_1, d_2, ..., d_K]`.
+            The data type is float32, float64.
+        label (Tensor): Label tensor, the shape is :math:`[N,]` or :math:`[N, d_1, d_2, ..., d_K]`.
+            The data type is int64.
+        output (Tensor): the `negative log likelihood loss` between input `x` and `label`.
+            If `reduction` is `'none'`, the shape is `[N, *]`.
+            If `reduction` is `'sum'` or `'mean'`, the shape is `[1]`.
+
+    Examples:
+        .. code-block:: python
+
+                import paddle
+                import numpy as np
+
+                nll_loss = paddle.nn.layer.NLLLoss()
+                log_softmax = paddle.nn.LogSoftmax(axis=1)
+
+                input_np = np.array([[0.88103855, 0.9908683 , 0.6226845 ],
+                                 [0.53331435, 0.07999352, 0.8549948 ],
+                                 [0.25879037, 0.39530203, 0.698465  ],
+                                 [0.73427284, 0.63575995, 0.18827209],
+                                 [0.05689114, 0.0862954 , 0.6325046 ]]).astype(np.float32)
+                label_np = np.array([0, 2, 1, 1, 0]).astype(np.int64)
+
+                place = paddle.CPUPlace()
+                paddle.disable_static(place)
+                input = paddle.to_variable(input_np)
+                log_out = log_softmax(input)
+                label = paddle.to_variable(label_np)
+                result = nll_loss(log_out, label)
+                print(result.numpy()) # [1.0720209]
+
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=-100,
+                 reduction='mean',
+                 name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
+                "'none', but received %s, which is not allowed." % reduction)
+        super(NLLLoss, self).__init__()
+        self._weight = weight
+        self._ignore_index = ignore_index
+        self._reduction = reduction
+        self._name = name
+
+    def forward(self, input, label):
+        return F.nll_loss(
+            input,
+            label,
+            weight=self._weight,
+            ignore_index=self._ignore_index,
+            reduction=self._reduction,
+            name=self._name)
+
+
+class KLDivLoss(fluid.dygraph.Layer):
+    """
+    This interface calculates the Kullback-Leibler divergence loss
+    between Input(X) and Input(Target). Notes that Input(X) is the
+    log-probability and Input(Target) is the probability.
+
+    KL divergence loss is calculated as follows:
+
+    $$l(x, y) = y * (\log(y) - x)$$
+
+    Parameters:
+        reduction (str, optional): Indicate how to average the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            Default is ``'mean'``.
+
+    Shape:
+      - input: (N, *) where * means, any number of additional dimensions.
+      - label: (N, *), same shape as input
+      - output: tensor with shape: (1) by default.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn as nn
+
+            paddle.enable_imperative()
+
+            shape = (5, 20)
+            x = np.random.uniform(-10, 10, shape).astype('float32')
+            target = np.random.uniform(-10, 10, shape).astype('float32')
+
+            # 'batchmean' reduction, loss shape will be [N]
+            kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[5]
+
+            # 'mean' reduction, loss shape will be [1]
+            kldiv_criterion = nn.KLDivLoss(reduction='mean')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[1]
+
+            # 'sum' reduction, loss shape will be [1]
+            kldiv_criterion = nn.KLDivLoss(reduction='sum')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[1]
+
+            # 'none' reduction, loss shape is same with X shape
+            kldiv_criterion = nn.KLDivLoss(reduction='none')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[5, 20]
+    """
+
+    def __init__(self, reduction='mean'):
+        super(KLDivLoss, self).__init__()
+        self.reduction = reduction
+
+    def forward(self, input, label):
+        out = paddle.nn.functional.kl_div(input, label, self.reduction)
+        return out
+
+
+class MarginRankingLoss(fluid.dygraph.Layer):
+    """
+
+    This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
+    The MarginRankingLoss layer calculates the margin rank loss between the input, other and label
+    , use the math function as follows.
+
+    .. math::
+        margin\_rank\_loss = max(0, -label * (input - other) + margin)
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'none'``, just return the origin ``margin_rank_loss``.
+
+    Parameters:
+        margin (float, optional): The margin value to add, default value is 0;
+        reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input: N-D Tensor, the shape is [N, *], N is batch size and `*` means any number of additional dimensions., available dtype is float32, float64.
+        other: N-D Tensor, `other` have the same shape and dtype as `input`.
+        label: N-D Tensor, label have the same shape and dtype as `input`.
+        output: If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
 
     Returns:
-        The tensor variable storing the nll_loss.
+        A callable object of MarginRankingLoss.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype("float32"))
+            other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype("float32"))
+            label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype("float32"))
+            margin_rank_loss = paddle.nn.MarginRankingLoss()
+            loss = margin_rank_loss(input, other, label)
+            print(loss.numpy()) # [0.75]
+    """
+
+    def __init__(self, margin=0.0, reduction='mean', name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % reduction)
+        super(MarginRankingLoss, self).__init__()
+        self.margin = margin
+        self.reduction = reduction
+        self.name = name
+
+    def forward(self, input, other, label):
+        out = paddle.nn.functional.margin_ranking_loss(
+            input, other, label, self.margin, self.reduction, self.name)
+        return out
+
+
+class CTCLoss(fluid.dygraph.Layer):
+    """
+	:alias_main: paddle.nn.CTCLoss
+	:alias: paddle.nn.CTCLoss, paddle.nn.layer.CTCLoss, paddle.nn.layer.loss.CTCLoss
+
+    An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be aliased as softmax with CTC, since a native softmax activation
+    is interated to the Warp-CTC library to normalize values for each row of the input tensor.
+
+    Parameters:
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
+
+    Shape:
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
+
+    Returns:
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and  ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
 
-    Return type: Variable.
-    
     Examples:
 
         .. code-block:: python
 
             # declarative mode
-            import paddle.fluid as fluid
             import numpy as np
             import paddle
 
-            input_np = np.random.random(size=(10, 10)).astype(np.float32)
-            label_np = np.random.randint(0, 10, size=(10,)).astype(np.int64)
-            prog = fluid.Program()
-            startup_prog = fluid.Program()
-            place = fluid.CPUPlace()
-            with fluid.program_guard(prog, startup_prog):
-                input = fluid.data(name='input', shape=[10, 10], dtype='float32')
-                label = fluid.data(name='label', shape=[10], dtype='int64')
-                nll_loss = paddle.nn.loss.NLLLoss()
-                res = nll_loss(input, label)
-
-                exe = fluid.Executor(place)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "label": label_np},
-                    fetch_list=[res])
-            print(static_result)
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_np)
-                label = dg.to_variable(label_np)
-                output = nll_loss(input, label)
-                print(output.numpy())
+            # length of the longest logit sequence
+            max_seq_length = 4
+            #length of the longest label sequence
+            max_label_length = 3
+            # number of logit sequences
+            batch_size = 2
+            # class num
+            class_num = 3
+
+            np.random.seed(1)
+            log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
+                                    [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
+
+                                    [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
+                                    [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
+
+                                    [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
+                                    [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
+
+                                    [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
+                                    [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
+
+                                    [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
+                                    [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32")
+            labels = np.array([[1, 2, 2],
+                            [1, 2, 2]]).astype("int32")
+            input_lengths = np.array([5, 5]).astype("int64")
+            label_lengths = np.array([3, 3]).astype("int64")
+
+            paddle.disable_static()
+            log_probs = paddle.to_tensor(log_probs)
+            labels = paddle.to_tensor(labels)
+            input_lengths = paddle.to_tensor(input_lengths)
+            label_lengths = paddle.to_tensor(label_lengths)
+
+            loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels,
+                input_lengths,
+                label_lengths)
+            print(loss.numpy())  #[3.9179852 2.9076521]
+
+            loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels,
+                input_lengths,
+                label_lengths)
+            print(loss.numpy())  #[1.1376063]
     """
 
-    def __init__(self, weight=None, reduction='mean', ignore_index=-100):
-        super(NLLLoss, self).__init__()
-        self.weight = weight
+    def __init__(self, blank=0, reduction='mean'):
+        super(CTCLoss, self).__init__()
+        self.blank = blank
         self.reduction = reduction
-        self.ignore_index = ignore_index
 
-    def forward(self, input, label):
-        dtype = self._helper.input_dtype(input)
+    def forward(self, log_probs, labels, input_lengths, label_lengths):
+        return paddle.nn.functional.ctc_loss(log_probs, labels, input_lengths,
+                                             label_lengths, self.blank,
+                                             self.reduction)
 
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64'], 'nll_loss')
-        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
-                                                   'nll_loss')
 
-        if self.reduction not in ['sum', 'mean', 'none']:
-            raise ValueError(
-                "The value of 'reduction' in nll_loss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % self.reduction)
-
-        x_shape = list(input.shape)
-        n = x_shape[0]
-        c = x_shape[1]
-        x_dims = len(x_shape)
-        if x_dims < 2:
-            raise ValueError('Expected 2 or more dimensions (got {})'.format(
-                x_dims))
-        if x_dims != 2 and x_dims != 4:
-            input = fluid.layers.reshape(input, shape=[n, c, 1, -1])
-            label = fluid.layers.reshape(label, shape=[n, 1, -1])
-            out_shape = [n] + x_shape[2:]
-
-        inputs = {'X': input, 'Label': label}
-        attrs = {'reduction': self.reduction, 'ignore_index': self.ignore_index}
-        if self.weight is not None:
-            if isinstance(self.weight, fluid.framework.Variable):
-                inputs['Weight'] = self.weight
-
-        out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
-        total_weight = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        outputs = {'Out': out, 'Total_weight': total_weight}
-
-        self._helper.append_op(
-            type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
-        if x_dims != 2 and x_dims != 4 and self.reduction == 'none':
-            out = fluid.layers.reshape(out, shape=out_shape)
+class SmoothL1Loss(fluid.dygraph.Layer):
+    """
+    This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
+    term if the absolute element-wise error falls below 1 and an L1 term otherwise.
+    In some cases it can prevent exploding gradients and it is more robust and less
+    sensitivity to outliers. Also known as the Huber loss:
 
-        return out
+    .. math::
+
+         loss(x,y)=\\frac{1}{n}\\sum_{i}z_i
+
+    where z_i is given by:
+
+    .. math::
+
+         \\mathop{z_i}=\\left\\{\\begin{array}{rcl}
+        0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\
+        delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise}
+        \\end{array} \\right.
+
+    Parameters:
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        delta (float, optional): Specifies the hyperparameter delta to be used.
+            The value determines how large the errors need to be to use L1. Errors
+            smaller than delta are minimized with L2. Parameter is ignored for
+            negative/zero values. Default = 1.0
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Call Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is float32 or float64. The shape of label
+            is the same as the shape of input.
+
+    Returns:
+        The tensor variable storing the smooth_l1_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(3,3).astype("float32")
+            label_data = np.random.rand(3,3).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            loss = paddle.nn.SmoothL1Loss()
+            output = loss(input, label)
+            print(output.numpy())
+    """
+
+    def __init__(self, reduction='mean', delta=1.0, name=None):
+        super(SmoothL1Loss, self).__init__()
+        self.reduction = reduction
+        self.delta = delta
+        self.name = name
+
+    def forward(self, input, label):
+        return F.smooth_l1_loss(
+            input,
+            label,
+            reduction=self.reduction,
+            delta=self.delta,
+            name=self.name)
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 1beba62c1809ffd94a22712fb24ac43a0ec23ff1..c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -1,4 +1,17 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,10 +30,1085 @@
 from ...fluid.dygraph.nn import InstanceNorm
 
 from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
+#from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
+
+#from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import SpectralNorm  #DEFINE_ALIAS
 
+from ...fluid.dygraph import layers
+
+from ...framework import get_default_dtype, set_default_dtype
+from ...fluid.framework import in_dygraph_mode
+
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid import core, dygraph_utils
+
+from ..functional import batch_norm, layer_norm, instance_norm
+
+import numpy as np
+import numbers
+import warnings
+
 __all__ = [
-    'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm'
+    'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
+    'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d',
+    'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm'
 ]
+
+
+class _InstanceNormBase(layers.Layer):
+    """
+    This class is based class for InstanceNorm1d, 2d, 3d. 
+
+    See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details.
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-5,
+                 momentum=0.9,
+                 weight_attr=None,
+                 bias_attr=None,
+                 track_running_stats=False,
+                 data_format="NCHW",
+                 name=None):
+        super(_InstanceNormBase, self).__init__()
+
+        if weight_attr == False or bias_attr == False:
+            assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm"
+        self._epsilon = epsilon
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        if weight_attr != False and bias_attr != False:
+            self.scale = self.create_parameter(
+                attr=self._weight_attr,
+                shape=[num_features],
+                default_initializer=Constant(1.0),
+                is_bias=False)
+            self.bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[num_features],
+                default_initializer=Constant(0.0),
+                is_bias=True)
+        else:
+            self.scale = None
+            self.bias = None
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError("InstanceNorm Base error")
+
+    def forward(self, input):
+        self._check_input_dim(input)
+
+        return instance_norm(
+            input, weight=self.scale, bias=self.bias, eps=self._epsilon)
+
+
+class InstanceNorm1d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCL `[batch, in_channels, length]`
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL".
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+
+    Shape:
+        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
+        - output: 3-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm1d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 2 and len(input.shape) != 3:
+            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class InstanceNorm2d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
+
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW.
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm2d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 4:
+            raise ValueError('expected 4D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class InstanceNorm3d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]`
+
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW.
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight).
+        - output: 5-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm3d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 5:
+            raise ValueError('expected 5D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class GroupNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``GroupNorm`` class.
+    For more details, refer to code examples.
+    It implements the function of the Group Normalization Layer.
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
+
+    Parameters:
+        num_channels(int): The number of channels of input.
+        num_groups(int): The number of groups that divided from channels.
+        epsilon(float, optional): The small value added to the variance to prevent
+                                  division by zero. Default: 1e-05.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+                                         scale :math:`g`. If it is set to False, no scale will be added to the output units.
+                                         If it is set to None, the bias is initialized one. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+                                        bias :math:`b`. If it is set to False, no bias will be added to the output units.
+                                        If it is set to None, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.
+        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6)
+          group_norm_out = group_norm(x)
+
+          print(group_norm_out.numpy)
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_groups,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_layout='NCHW',
+                 name=None):
+        super(GroupNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._epsilon = epsilon
+        self._num_channels = num_channels
+        self._num_groups = num_groups
+        if data_layout != 'NCHW':
+            raise ValueError("unsupported data layout:" + data_layout)
+
+        param_shape = [self._num_channels]
+
+        self.weight = self.create_parameter(
+            attr=self._weight_attr or False,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+
+        self.bias = self.create_parameter(
+            attr=self._weight_attr or False, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        inputs = {'X': input}
+        if self.bias is not None:
+            inputs['Bias'] = self.bias
+        if self.weight is not None:
+            inputs['Scale'] = self.weight
+
+        # create output
+        mean_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        variance_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        group_norm_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype)
+
+        self._helper.append_op(
+            type="group_norm",
+            inputs=inputs,
+            outputs={
+                "Y": group_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={"epsilon": self._epsilon,
+                   "groups": self._num_groups})
+
+        return self._helper.append_activation(group_norm_out, None)
+
+
+class LayerNorm(layers.Layer):
+    """
+    :alias_main: paddle.nn.LayerNorm
+	:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
+	:old_api: paddle.fluid.dygraph.LayerNorm
+
+    This interface is used to construct a callable object of the ``LayerNorm`` class.
+    For more details, refer to code examples.
+    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
+    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
+    The formula is as follows:
+
+    ..  math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}
+
+        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)
+
+    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
+    - :math:`H`: the number of hidden units in a layers
+    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+    - :math:`g`: the trainable scale parameter.
+    - :math:`b`: the trainable bias parameter.
+
+    Parameters:
+        normalized_shape(int|list|tuple): Input shape from an expected input of
+            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
+            If it is a single integer, this module will normalize over the last dimension
+            which is expected to be of that specific size.
+        epsilon(float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The
+            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
+        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 2-D, 3-D, 4-D or 5-D tensor.
+        - output: same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
+          layer_norm_out = layer_norm(x)
+
+          print(layer_norm_out.numpy)
+    """
+
+    def __init__(self,
+                 normalized_shape,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(LayerNorm, self).__init__()
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = [normalized_shape]
+
+        self._normalized_shape = list(normalized_shape)
+        self._epsilon = epsilon
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        param_shape = [np.prod(self._normalized_shape)]
+
+        if weight_attr is False:
+            self.weight = None
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+
+        if bias_attr is False:
+            self.bias = None
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        return layer_norm(
+            input,
+            normalized_shape=self._normalized_shape,
+            weight=self.weight,
+            bias=self.bias,
+            epsilon=self._epsilon)
+
+
+class _BatchNormBase(layers.Layer):
+    """
+    BatchNorm base .
+    """
+
+    def __init__(self,
+                 num_features,
+                 momentum=0.9,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 track_running_stats=True,
+                 name=None):
+        super(_BatchNormBase, self).__init__()
+        self._num_features = num_features
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        if get_default_dtype() == 'float16':
+            set_default_dtype('float32')
+
+        param_shape = [num_features]
+
+        # create parameter
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+        self.weight.stop_gradient = (self._weight_attr is False) or (
+            self._weight_attr and self._weight_attr.learning_rate == 0.)
+
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=param_shape, is_bias=True)
+        self.bias.stop_gradient = (self._bias_attr is False) or (
+            self._bias_attr and self._bias_attr.learning_rate == 0.)
+
+        moving_mean_name = None
+        moving_variance_name = None
+
+        if name is not None:
+            moving_mean_name = name + "_mean"
+            moving_variance_name = name + "_variance"
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+        self._data_format = data_format
+        self._in_place = False
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._fuse_with_relu = False
+        self._track_running_stats = track_running_stats
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError("BatchNorm Base error")
+
+    def forward(self, input):
+
+        self._check_input_dim(input)
+
+        if not self.training and not self._track_running_stats:
+            raise ValueError(
+                'When inference, expected track_running_stats is True.')
+
+        if self.training and not self._track_running_stats:
+            warnings.warn(
+                "When training, we now always track global mean and variance.")
+
+        return batch_norm(
+            input,
+            self._mean,
+            self._variance,
+            weight=self.weight,
+            bias=self.bias,
+            training=self.training,
+            momentum=self._momentum,
+            epsilon=self._epsilon,
+            data_format=self._data_format)
+
+
+class BatchNorm1d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL".
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
+        - output: 3-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+    
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm1d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 2 and len(input.shape) != 3:
+            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm2d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm2d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 4:
+            raise ValueError('expected 4D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm3d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: NCDHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight).
+        - output: 5-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm3d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 5:
+            raise ValueError('expected 5D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class SyncBatchNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
+    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can 
+    be used as a normalizer function for other operations, such as conv2d and fully connected 
+    operations.
+    The data is normalized by the mean and variance of the channel based on whole mini-batch
+    , which including data in all gpus.
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    When model in training mode, the :math:`\\mu_{\\beta}` 
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    - :math:`x` : whole mini-batch data in all gpus
+    - :math:`m` : the size of the whole mini-batch data
+
+    When model in evaluation mode, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, 
+    which usually got from the pre-trained model). Global statistics calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The formula of normalization is as follows:
+ 
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\eps` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter vector
+    - :math:`\\beta` : trainable shift parameter vector 
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of this layer. If it is set to None or one attribute of ParamAttr, this layerr
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. If it is set to False, 
+             this layer will not have trainable scale parameter. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
+             If it is set to None or one attribute of ParamAttr, this layer
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. If it is set to False, this layer will not 
+             have trainable bias parameter. Default: None.
+        track_running_stats(bool, optional): Whether to compute global stats, which including running mean and 
+             running variance. Default: True.
+
+    Shapes:
+        input: Tensor that the dimension from 2 to 5.
+        output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(x)
+          if paddle.fluid.is_compiled_with_cuda():
+              sync_batch_norm = nn.SyncBatchNorm(2)
+              hidden1 = sync_batch_norm(x)
+              print(hidden1.numpy())
+              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-05,
+                 momentum=0.9,
+                 track_running_stats=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        super(SyncBatchNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._num_features = num_features
+        self._data_layout = data_format
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._track_running_stats = track_running_stats
+
+        if self._track_running_stats == False:
+            warnings.warn(
+                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
+            )
+
+        param_shape = [self._num_features]
+
+        # create parameter
+        if weight_attr == False:
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+
+        if bias_attr == False:
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+    def forward(self, x):
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance out share the same memory
+        variance_out = self._variance
+
+        ### train mode: use mini-batch stats, eval mode: use global stats
+        ### use_global_stats only support False in sync_batch_norm
+        if in_dygraph_mode():
+            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
+                     "is_test", not self.training, "data_layout",
+                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     False, "use_global_stats", False, 'trainable_statistics',
+                     False)
+            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance, mean_out,
+                variance_out, *attrs)
+
+            return sync_batch_norm_out
+
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'BatchNorm')
+
+        attrs = {
+            "momentum": self._momentum,
+            "epsilon": self._epsilon,
+            "is_test": not self.training,
+            "data_layout": self._data_layout,
+            "use_mkldnn": False,
+            "fuse_with_relu": False,
+            "use_global_stats": False,
+            "trainable_statistics": False,
+        }
+
+        inputs = {
+            "X": [x],
+            "Scale": [self.weight],
+            "Bias": [self.bias],
+            "Mean": [self._mean],
+            "Variance": [self._variance]
+        }
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        outputs = {
+            "Y": [sync_batch_norm_out],
+            "MeanOut": [mean_out],
+            "VarianceOut": [variance_out],
+            "SavedMean": [saved_mean],
+            "SavedVariance": [saved_variance]
+        }
+
+        self._helper.append_op(
+            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        return sync_batch_norm_out
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
new file mode 100755
index 0000000000000000000000000000000000000000..87fa0caec9ee287c42d8308d9da25c6d2fc9b911
--- /dev/null
+++ b/python/paddle/nn/layer/pooling.py
@@ -0,0 +1,877 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ...fluid.layers import utils
+from ...fluid.dygraph import layers
+from ...fluid.layer_helper import LayerHelper
+from .. import functional as F
+
+__all__ = [
+    'AdaptiveAvgPool2d',
+    'AdaptiveAvgPool3d',
+    'AvgPool1d',
+    'maxPool1d',
+    'AdaptiveMaxPool1d',
+    'AdaptiveAvgPool1d',
+    'AvgPool2d',
+    'MaxPool2d',
+    'AvgPool3d',
+    'MaxPool3d',
+]
+
+
+class AdaptiveAvgPool2d(layers.Layer):
+    """
+
+    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two element, (H, W). H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
+        output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool2d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(i * W / n)
+            #             wend = ceil((i + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCHW", name=None):
+        super(AdaptiveAvgPool2d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool2d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AdaptiveAvgPool3d(layers.Layer):
+    """
+
+    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
+            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64.
+        output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool3d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 8, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out = [2, 3, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCDHW", name=None):
+        super(AdaptiveAvgPool3d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool3d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AvgPool1d(layers.Layer):
+    """
+    This operation applies a 1D average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
+
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
+            then the input is implicitly zero-padded on both sides for padding number of points.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AvgPool1d = nn.AvgPool1d(kernel_size=2, stride=2, padding=0)
+          pool_out = AvgPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 count_include_pad=True,
+                 ceil_mode=False,
+                 name=None):
+        super(AvgPool1d, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.name = name
+
+    def forward(self, x):
+        out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding,
+                           self.count_include_pad, self.ceil_mode, self.name)
+        return out
+
+
+class MaxPool1d(layers.Layer):
+    """
+    Applies a 1D max pooling over an input signal composed of several input planes based
+    on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`.
+        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
+          pool_out = MaxPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
+          pool_out, indices = MaxPool1d(data)
+          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 return_indices=False,
+                 ceil_mode=False,
+                 name=None):
+        super(MaxPool1d, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.return_indices = return_indices
+        self.name = name
+
+    def forward(self, input):
+        out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
+                           self.return_indices, self.ceil_mode, self.name)
+        return out
+
+
+class AdaptiveAvgPool1d(layers.Layer):
+    """
+
+    This operation applies a 1D adaptive average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For average adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
+
+    Args:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one int.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+          # average adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
+          #
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
+          pool_out = AdaptiveAvgPool1d(data)
+          # pool_out shape: [1, 3, 16]
+    """
+
+    def __init__(self, output_size, name=None):
+        super(AdaptiveAvgPool1d, self).__init__()
+        self.output_size = output_size
+        self.name = name
+
+    def forward(self, input):
+        return F.adaptive_avg_pool1d(input, self.output_size, self.name)
+
+
+class AdaptiveMaxPool1d(layers.Layer):
+    """
+
+    This operation applies a 1D adaptive max pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For max adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= max(Input[lstart:lend])}
+
+    Args:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+             it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Returns:
+        None.
+
+    Raises:
+        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+          # max adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = max(input[:, :, lstart: lend])
+          #
+                    import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
+          pool_out = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+          # for return_indices = true
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
+          pool_out, indices = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+
+    """
+
+    def __init__(self, output_size, return_indices=False, name=None):
+        super(AdaptiveMaxPool1d, self).__init__()
+        self.output_size = output_size
+        self.return_indices = return_indices
+        self.name = name
+
+    def forward(self, input):
+        return F.adaptive_max_pool1d(input, self.output_size,
+                                     self.return_indices, self.name)
+
+
+class AvgPool2d(layers.Layer):
+    """
+    This operation applies 2D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           $$
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+
+    Returns: None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          AvgPool2d = nn.AvgPool2d(kernel_size=2,
+                                stride=2, padding=0)
+          output = AvgPoo2d(input)
+          # output.shape [1, 3, 16, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCHW",
+                 name=None):
+        super(AvgPool2d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.avg_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class MaxPool2d(layers.Layer):
+    """
+    This operation applies 2D max pooling over input feature based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
+           $$
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns: None
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = MaxPool2d(input)
+          # output.shape [1, 3, 16, 16]
+
+          # for return_indices=True
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
+          output, max_indices = MaxPool2d(input)
+          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 return_indices=False,
+                 ceil_mode=False,
+                 data_format="NCHW",
+                 name=None):
+        super(MaxPool2d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.max_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_indices=self.return_indices,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class MaxPool3d(layers.Layer):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int. Default kernel_size.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+
+    Returns:None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
+          MaxPool3d = nn.MaxPool3d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = MaxPool3d(input)
+          # output.shape [1, 2, 3, 16, 16]
+
+          # for return_indices=True
+          MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True)
+          output, max_indices = MaxPool3d(input)
+          # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding,
+                 return_indices=False,
+                 ceil_mode=False,
+                 data_format="NCDHW",
+                 name=None):
+        super(MaxPool3d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.max_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_indices=self.return_indices,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class AvgPool3d(layers.Layer):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns: None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # avg pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
+          AvgPool3d = nn.AvgPool3d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = AvgPool3d(input)
+          # output.shape [1, 2, 3, 16, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCDHW",
+                 name=None):
+        super(AvgPool3d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.avg_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 4717609503f7faafc16d8c15e3d404b0d780c3e1..6f1c5f199ac99692840ad3c5cffdb726baf5fa19 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -12,10 +12,1333 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define classes of recurrent neural network  
+import copy
+import collections
+import itertools
+import six
+import math
+import sys
+import warnings
+from functools import partial, reduce
+
+import paddle
+from paddle import framework
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+from paddle.fluid.dygraph import Layer, LayerList
+from paddle.fluid.layers import utils
+from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
+from paddle.fluid.data_feeder import convert_dtype
 
 __all__ = [
-    #       'RNNCell',
-    #       'GRUCell',
-    #       'LSTMCell'
+    'RNNCellBase',
+    'SimpleRNNCell',
+    'LSTMCell',
+    'GRUCell',
+    'RNN',
+    'BiRNN',
+    'SimpleRNN',
+    'LSTM',
+    'GRU',
 ]
+
+
+def split_states(states, bidirectional=False, state_components=1):
+    r"""
+    Split states of RNN network into possibly nested list or tuple of
+    states of each RNN cells of the RNN network.
+
+    Arguments:
+        states (Tensor|tuple|list): the concatenated states for RNN network.
+            When `state_components` is 1, states in a Tensor with shape
+            `(L*D, N, C)` where `L` is the number of layers of the RNN 
+            network, `D` is the number of directions of the RNN network(1 
+            for unidirectional RNNs and 2 for bidirectional RNNs), `N` is 
+            the batch size of the input to the RNN network, `C` is the 
+            hidden size of the RNN network. 
+
+            When `state_components` is larger than 1, `states` is a tuple of 
+            `state_components` Tensors that meet the requirements described 
+            above. 
+            
+            For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs, 
+            `state_components` is 2.
+        bidirectional (bool): whether the state is of a bidirectional RNN 
+            network. Defaults to False.
+        state_components (int): the number of the components of the states. see
+            `states` above. Defaults to 1.
+    
+    Returns:
+        A nested list or tuple of RNN cell states. 
+        If `bidirectional` is True, it can be indexed twice to get an RNN 
+        cell state. The first index indicates the layer, the second index 
+        indicates the direction.
+        If `bidirectional` is False, it can be indexed once to get an RNN
+        cell state. The index indicates the layer.
+        Note that if `state_components` is larger than 1, an RNN cell state
+        can be indexed one more time to get a tensor of shape(N, C), where 
+        `N` is the batch size of the input to the RNN cell, and `C` is the
+        hidden size of the RNN cell.
+    """
+    if state_components == 1:
+        states = paddle.unstack(states)
+        if not bidirectional:
+            return states
+        else:
+            return list(zip(states[::2], states[1::2]))
+    else:
+        assert len(states) == state_components
+        states = tuple([paddle.unstack(item) for item in states])
+        if not bidirectional:
+            return list(zip(*states))
+        else:
+            states = list(zip(*states))
+            return list(zip(states[::2], states[1::2]))
+
+
+def concat_states(states, bidirectional=False, state_components=1):
+    r"""
+    Concatenate a possibly nested list or tuple of RNN cell states into a 
+    compact form.
+
+    Arguments:
+        states (list|tuple): a possibly nested list or tuple of RNN cell 
+            states. 
+            If `bidirectional` is True, it can be indexed twice to get an 
+            RNN cell state. The first index indicates the layer, the second 
+            index indicates the direction.
+            If `bidirectional` is False, it can be indexed once to get an RNN
+            cell state. The index indicates the layer.
+            Note that if `state_components` is larger than 1, an RNN cell 
+            state can be indexed one more time to get a tensor of shape(N, C), 
+            where `N` is the batch size of the input to the RNN cell, and 
+            `C` is the hidden size of the RNN cell. 
+        bidirectional (bool): whether the state is of a bidirectional RNN 
+            network. Defaults to False.
+        state_components (int): the number of the components of the states. see
+            `states` above. Defaults to 1.
+    
+    Returns:
+        Concatenated states for RNN network.
+        When `state_components` is 1, states in a Tensor with shape
+        `(L\*D, N, C)` where `L` is the number of layers of the RNN 
+        network, `D` is the number of directions of the RNN network(1 for 
+        unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch 
+        size of the input to the RNN network, `C` is the hidden size of the 
+        RNN network.
+        
+    """
+    if state_components == 1:
+        return paddle.stack(flatten(states))
+    else:
+        states = flatten(states)
+        componnets = []
+        for i in range(state_components):
+            componnets.append(states[i::state_components])
+        return [paddle.stack(item) for item in componnets]
+
+
+class RNNCellBase(Layer):
+    r"""
+    RNNCellBase is the base class for abstraction representing the calculations
+    mapping the input and state to the output and new state. It is suitable to
+    and mostly used in RNN.
+    """
+
+    def get_initial_states(self,
+                           batch_ref,
+                           shape=None,
+                           dtype=None,
+                           init_value=0.,
+                           batch_dim_idx=0):
+        r"""
+        Generate initialized states according to provided shape, data type and
+        value.
+        Arguments:
+            batch_ref (Tensor): A tensor, which shape would be used to 
+                determine the batch size, which is used to generate initial 
+                states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is 
+                treated as batch size.
+            shape (list|tuple, optional): A (possibly nested structure of) shape[s], 
+                where a shape is a list/tuple of integer). `-1` (for batch size) 
+                will be automatically prepended if a shape does not starts with 
+                it. If None, property `state_shape` will be used. Defaults to 
+                None.
+            dtype (str|list|tuple, optional): A (possibly nested structure of) 
+                data type[s]. The structure must be same as that of `shape`, 
+                except when all tensors' in states has the same data type, a 
+                single data type can be used. If None and property `cell.state_shape` 
+                is not available, current default floating type of paddle is 
+                used. Defaults to None.
+            init_value (float, optional): A float value used to initialize states. 
+                Defaults to 0.
+            batch_dim_idx (int, optional): An integer indicating which 
+                dimension of the of `batch_ref` represents batch. Defaults to 0.
+        Returns:
+            init_states (Tensor|tuple|list): tensor of the provided shape and 
+                dtype, or list of tensors that each satisfies the requirements,
+                packed in the same structure as `shape` and `type` does.
+        """
+        # TODO: use inputs and batch_size
+        batch_ref = flatten(batch_ref)[0]
+
+        def _is_shape_sequence(seq):
+            if sys.version_info < (3, ):
+                integer_types = (
+                    int,
+                    long, )
+            else:
+                integer_types = (int, )
+            """For shape, list/tuple of integer is the finest-grained objection"""
+            if (isinstance(seq, list) or isinstance(seq, tuple)):
+                if reduce(lambda flag, x: isinstance(x, integer_types) and flag,
+                          seq, True):
+                    return False
+            # TODO: Add check for the illegal
+            if isinstance(seq, dict):
+                return True
+            return (isinstance(seq, collections.Sequence) and
+                    not isinstance(seq, six.string_types))
+
+        class Shape(object):
+            def __init__(self, shape):
+                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
+
+        # nested structure of shapes
+        states_shapes = self.state_shape if shape is None else shape
+        is_sequence_ori = utils.is_sequence
+        utils.is_sequence = _is_shape_sequence
+        states_shapes = map_structure(lambda shape: Shape(shape), states_shapes)
+        utils.is_sequence = is_sequence_ori
+
+        # nested structure of dtypes
+        try:
+            states_dtypes = self.state_dtype if dtype is None else dtype
+        except NotImplementedError:
+            states_dtypes = framework.get_default_dtype()
+        if len(flatten(states_dtypes)) == 1:
+            dtype = flatten(states_dtypes)[0]
+            states_dtypes = map_structure(lambda shape: dtype, states_shapes)
+
+        init_states = map_structure(
+            lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like(
+                input=batch_ref,
+                shape=shape.shape,
+                dtype=dtype,
+                value=init_value,
+                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
+        return init_states
+
+    @property
+    def state_shape(self):
+        r"""
+        Abstract method (property).
+        Used to initialize states.
+        A (possiblely nested structure of) shape[s], where a shape is a 
+        list/tuple of integers (-1 for batch size would be automatically
+        inserted into a shape if shape is not started with it).
+        Not necessary to be implemented if states are not initialized by
+        `get_initial_states` or the `shape` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_shape` in the used cell.")
+
+    @property
+    def state_dtype(self):
+        r"""
+        Abstract method (property).
+        Used to initialize states.
+        A (possiblely nested structure of) data types[s]. The structure must be
+        same as that of `shape`, except when all tensors' in states has the same
+        data type, a signle data type can be used.
+        Not necessary to be implemented if states are not initialized
+        by `get_initial_states` or the `dtype` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_dtype` in the used cell.")
+
+
+class SimpleRNNCell(RNNCellBase):
+    r"""
+    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it 
+    computes the outputs and updates states.
+
+    The formula used is as follows:
+
+    .. math::
+        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh})
+        y_{t} & = h_{t}
+    
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Please refer to `Finding Structure in Time 
+    <https://crl.ucsd.edu/~elman/Papers/fsit.pdf>`_ for more details.
+    
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        activation (str, optional): The activation in the SimpleRNN cell. 
+            It can be `tanh` or `relu`. Defaults to `tanh`.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for 
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (hidden_size, input_size), input to hidden 
+            weight, corresponding to :math:`W_{ih}` in the formula.
+        weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to 
+            hidden weight, corresponding to :math:`W_{hh}` in the formula.
+        bias_ih (Parameter): shape (hidden_size, ), input to hidden bias, 
+            corresponding to :math:`b_{ih}` in the formula.
+        bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias, 
+            corresponding to :math:`b_{hh}` in the formula.
+    
+    Inputs:
+        inputs (Tensor): shape `[batch_size, input_size]`, the input, 
+                corresponding to :math:`x_t` in the formula.
+        states (Tensor, optional): shape `[batch_size, hidden_size]`, the
+            previous hidden state, corresponding to :math:`h_{t-1}` in the 
+            formula. When states is None, zero state is used. Defaults to 
+            None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output, 
+            corresponding to :math:`h_{t}` in the formula.
+        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden 
+            state, corresponding to :math:`h_{t}` in the formula.
+    
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by 
+        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more 
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+            y, h = cell(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 activation="tanh",
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(SimpleRNNCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        if activation not in ["tanh", "relu"]:
+            raise ValueError(
+                "activation for SimpleRNNCell should be tanh or relu, "
+                "but get {}".format(activation))
+        self.activation = activation
+        self._activation_fn = paddle.tanh \
+            if activation == "tanh" \
+            else F.relu
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+        pre_h = states
+        i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            i2h += self.bias_ih
+        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            h2h += self.bias_hh
+        h = self._activation_fn(i2h + h2h)
+        return h, h
+
+    @property
+    def state_shape(self):
+        return (self.hidden_size, )
+
+
+class LSTMCell(RNNCellBase):
+    r"""
+    Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, 
+    it computes the outputs and updates states.
+
+    The formula used is as follows:
+
+    .. math::
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
+        \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
+        c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t}
+        h_{t} & = o_{t} \* \\tanh(c_{t})
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for 
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for 
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (4 * hidden_size, input_size), input to 
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
+        weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to 
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
+        bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias, 
+            which corresponds to the concatenation of
+             :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
+        bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, 
+            which corresponds to the concatenation of
+             :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.
+
+    Inputs:
+        inputs (Tensor): shape `[batch_size, input_size]`, the input, 
+            corresponding to :math:`x_t` in the formula.
+        states (tuple, optional): a tuple of two tensors, each of shape 
+            `[batch_size, hidden_size]`, the previous hidden state, 
+            corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. 
+            When states is None, zero state is used. Defaults to None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output, 
+            corresponding to :math:`h_{t}` in the formula.
+        states (tuple): a tuple of two tensors, each of shape 
+            `[batch_size, hidden_size]`, the new hidden states,
+            corresponding to :math:`h_{t}, c{t}` in the formula.
+
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by 
+        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more 
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+            prev_c = paddle.randn((4, 32))
+
+            cell = paddle.nn.LSTMCell(16, 32)
+            y, (h, c) = cell(x, (prev_h, prev_c))
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(LSTMCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (4 * hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (4 * hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (4 * hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (4 * hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+        pre_hidden, pre_cell = states
+        gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            gates = gates + self.bias_ih
+        gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            gates = gates + self.bias_hh
+
+        chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1)
+
+        i = self._gate_activation(chunked_gates[0])
+        f = self._gate_activation(chunked_gates[1])
+        o = self._gate_activation(chunked_gates[3])
+        c = f * pre_cell + i * self._activation(chunked_gates[2])
+        h = o * self._activation(c)
+
+        return h, (h, c)
+
+    @property
+    def state_shape(self):
+        r"""
+        The `state_shape` of LSTMCell is a tuple with two shapes: 
+        `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be 
+        automatically inserted into shape). These two shapes correspond 
+        to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
+        """
+        return ((self.hidden_size, ), (self.hidden_size, ))
+
+
+class GRUCell(RNNCellBase):
+    r"""
+    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, 
+    it computes the outputs and updates states.
+
+    The formula for GRU used is as follows:
+
+    .. math::
+
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr})
+        z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz})
+        \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc}))
+        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t}
+        y_{t} & = h_{t}
+    
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Parameters:
+        input_size (int): The input size..
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for 
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for 
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (3 * hidden_size, input_size), input to 
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
+        weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to 
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
+        bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias, 
+            which corresponds to the concatenation of
+             :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
+        bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias, 
+            which corresponds to the concatenation of
+             :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
+
+    Inputs:
+        inputs (Tensor): A tensor with shape `[batch_size, input_size]`,
+            corresponding to :math:`x_t` in the formula.
+        states (Tensor): A tensor with shape `[batch_size, hidden_size]`.
+            corresponding to :math:`h_{t-1}` in the formula.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output, 
+            corresponding to :math:`h_{t}` in the formula.
+        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden 
+            state, corresponding to :math:`h_{t}` in the formula.
+    
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by 
+        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more 
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.GRUCell(16, 32)
+            y, h = cell(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(GRUCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (3 * hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (3 * hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (3 * hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (3 * hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+
+        pre_hidden = states
+        x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            x_gates = x_gates + self.bias_ih
+        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            h_gates = h_gates + self.bias_hh
+
+        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
+        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
+
+        r = self._gate_activation(x_r + h_r)
+        z = self._gate_activation(x_z + h_z)
+        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
+        h = (pre_hidden - c) * z + c
+
+        return h, h
+
+    @property
+    def state_shape(self):
+        r"""
+        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
+        size would be automatically inserted into shape). The shape corresponds
+        to the shape of :math:`h_{t-1}`.
+        """
+        return (self.hidden_size, )
+
+
+class RNN(Layer):
+    r"""
+    Wrapper for RNN, which creates a recurrent neural network with an RNN cell. 
+    It performs :code:`cell.forward()` repeatedly until reaches to the maximum 
+    length of `inputs`.
+
+    Arguments:
+        cell(RNNCellBase): An instance of `RNNCellBase`.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Defaults to False.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+
+    Inputs:
+        inputs (Tensor): A (possibly nested structure of) tensor[s]. The input 
+            sequences. 
+            If time major is True, the shape is `[batch_size, time_steps, input_size]`
+            If time major is False, the shape is [time_steps, batch_size, input_size]`
+            where `input_size` is the input size of the cell.
+        initial_states (Tensor|list|tuple, optional): Tensor of a possibly 
+            nested structure of tensors, representing the initial state for 
+            the rnn cell. If not provided, `cell.get_initial_states` would be 
+            called to produce the initial states. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        **kwargs: Additional keyword arguments to pass to `forward` of the cell. 
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor|list|tuple): the output sequences.
+            If `time_major` is True, the shape is 
+            `[time_steps, batch_size, hidden_size]`, else 
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states of the cell. Tensor or 
+            a possibly nested structure of tensors which has the same structure 
+            with intial state. Each tensor in final states has the same shape 
+            and dtype as the corresponding tensor in initial states.
+    
+    Notes:
+        This class is a low level API for wrapping rnn cell into a RNN network.
+        Users should take care of the state of the cell. If `initial_states` is 
+        passed to the `forward` method, make sure that it satisfies the 
+        requirements of the cell.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+            rnn = paddle.nn.RNN(cell)
+            outputs, final_states = rnn(inputs, prev_h)
+
+    """
+
+    def __init__(self, cell, is_reverse=False, time_major=False):
+        super(RNN, self).__init__()
+        self.cell = cell
+        if not hasattr(self.cell, "call"):
+            # for non-dygraph mode, `rnn` api uses cell.call
+            self.cell.call = self.cell.forward
+        self.is_reverse = is_reverse
+        self.time_major = time_major
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        if initial_states is None:
+            initial_states = self.cell.get_initial_states(
+                batch_ref=inputs,
+                dtype=inputs.dtype,
+                batch_dim_idx=self.batch_index)
+
+        final_outputs, final_states = F.rnn(self.cell,
+                                            inputs,
+                                            initial_states=initial_states,
+                                            sequence_length=sequence_length,
+                                            time_major=self.time_major,
+                                            is_reverse=self.is_reverse,
+                                            **kwargs)
+        return final_outputs, final_states
+
+
+class BiRNN(Layer):
+    r"""
+    Wrapper for bidirectional RNN, which builds a bidiretional RNN given the 
+    forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and 
+    backward RNN with coresponding cells separately and concats the outputs 
+    along the last axis.
+
+    Arguments:
+        cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN.
+        cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+
+    Inputs:
+        inputs (Tensor): the input sequences of both RNN. 
+            If time_major is True, the shape of is 
+            `[time_steps, batch_size, input_size]`, else the shape is
+            `[batch_size, time_steps, input_size]`, where input_size is the 
+            input size of both cells.
+        initial_states (list|tuple, optional): A tuple/list of the initial 
+            states of the forward cell and backward cell. Defaults to None.
+            If not provided, `cell.get_initial_states` would be called to 
+            produce the initial states for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        **kwargs: Additional keyword arguments. Arguments passed to `forward` 
+            for each cell.
+
+    Outputs:
+        (outputs, final_states)
+        outputs (Tensor): the outputs of the bidirectional RNN. It is the 
+            concatenation of the outputs from the forward RNN and backward 
+            RNN along the last axis. 
+            If time major is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward 
+            cell and backward cell. 
+
+    Notes:
+        This class is a low level API for wrapping rnn cells into a BiRNN 
+        network. Users should take care of the states of the cells. 
+        If `initial_states` is passed to the `forward` method, make sure that 
+        it satisfies the requirements of the cells.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            cell_fw = paddle.nn.LSTMCell(16, 32)
+            cell_bw = paddle.nn.LSTMCell(16, 32)
+            rnn = paddle.nn.BiRNN(cell_fw, cell_bw)
+
+            inputs = paddle.rand((2, 23, 16))
+            outputs, final_states = rnn(inputs)
+
+    """
+
+    def __init__(self, cell_fw, cell_bw, time_major=False):
+        super(BiRNN, self).__init__()
+        self.cell_fw = cell_fw
+        self.cell_bw = cell_bw
+        if cell_fw.input_size != cell_bw.input_size:
+            raise ValueError("input size of forward cell({}) does not equals"
+                             "that of backward cell({})".format(
+                                 cell_fw.input_size, cell_bw.input_size))
+        for cell in [self.cell_fw, self.cell_bw]:
+            if not hasattr(cell, "call"):
+                # for non-dygraph mode, `rnn` api uses cell.call
+                cell.call = cell.forward
+        self.time_major = time_major
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        if isinstance(initial_states, (list, tuple)):
+            assert len(initial_states) == 2, \
+                "length of initial_states should be 2 when it is a list/tuple"
+        else:
+            initial_states = [initial_states, initial_states]
+
+        outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs,
+                                        initial_states, sequence_length,
+                                        self.time_major, **kwargs)
+        return outputs, final_states
+
+
+class RNNMixin(LayerList):
+    r"""
+    A Mixin class for RNN networks. It provides `forward` method for SimpleRNN,
+    LSTM and GRU.
+    """
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        batch_index = 1 if self.time_major else 0
+        dtype = inputs.dtype
+        if initial_states is None:
+            state_shape = (self.num_layers * self.num_directions, -1,
+                           self.hidden_size)
+            if self.state_components == 1:
+                initial_states = paddle.fluid.layers.fill_constant_batch_size_like(
+                    inputs, state_shape, dtype, 0, batch_index, 1)
+            else:
+                initial_states = tuple([
+                    paddle.fluid.layers.fill_constant_batch_size_like(
+                        inputs, state_shape, dtype, 0, batch_index, 1)
+                    for _ in range(self.state_components)
+                ])
+
+        states = split_states(initial_states, self.num_directions == 2,
+                              self.state_components)
+        final_states = []
+
+        for i, rnn_layer in enumerate(self):
+            if i > 0:
+                inputs = F.dropout(
+                    inputs,
+                    self.dropout,
+                    training=self.training,
+                    mode="upscale_in_train")
+            outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
+            final_states.append(final_state)
+            inputs = outputs
+
+        final_states = concat_states(final_states, self.num_directions == 2,
+                                     self.state_components)
+        return outputs, final_states
+
+
+class SimpleRNN(RNNMixin):
+    r"""
+    Multilayer Elman network(SimpleRNN). It takes input sequences and initial 
+    states as inputs, and returns the output sequences and the final states.
+
+    Each layer inside the SimpleRNN maps the input sequences and initial states 
+    to the output sequences and final states in the following manner: at each 
+    step, it takes step inputs(:math:`x_{t}`) and previous 
+    states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`)
+    and new states(:math:`h_{t}`).
+
+    .. math::
+
+        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh})
+        y_{t} & = h_{t}
+    
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        activation (str, optional): The activation in each SimpleRNN cell. It can be 
+            `tanh` or `relu`. Defaults to `tanh`.
+        direction (str, optional): The direction of the network. It can be "forward", 
+            "backward" and "bidirectional". Defaults to "forward".
+        dropout (float, optional): The droput probability. Dropout is applied to the 
+            input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_ih` of each cell. Defaults to None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_hh` of each cell. Defaults to None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih` of each cells. Defaults to None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh` of each cells. Defaults to None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence. 
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, hidden_size]`.
+        initial_states (Tensor, optional): the initial state. The shape is
+            `[num_lauers * num_directions, batch_size, hidden_size]`. 
+            If initial_state is not given, zero initial states are used.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence. 
+            If `time_major` is True, the shape is 
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            else, the shape is 
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1.
+        final_states (Tensor): final states. The shape is
+            `[num_lauers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.SimpleRNN(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            y, h = rnn(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 activation="tanh",
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(SimpleRNN, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = SimpleRNNCell(input_size, hidden_size, activation,
+                                 weight_ih_attr, weight_hh_attr, bias_ih_attr,
+                                 bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = SimpleRNNCell(hidden_size, hidden_size, activation,
+                                     weight_ih_attr, weight_hh_attr,
+                                     bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = SimpleRNNCell(input_size, hidden_size, activation,
+                                    weight_ih_attr, weight_hh_attr,
+                                    bias_ih_attr, bias_hh_attr)
+            cell_bw = SimpleRNNCell(input_size, hidden_size, activation,
+                                    weight_ih_attr, weight_hh_attr,
+                                    bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = SimpleRNNCell(
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
+                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = SimpleRNNCell(
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
+                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
+
+
+class LSTM(RNNMixin):
+    r"""
+    Multilayer LSTM. It takes a sequence and an initial state as inputs, and 
+    returns the output sequences and the final states.
+
+    Each layer inside the LSTM maps the input sequences and initial states 
+    to the output sequences and final states in the following manner: at each 
+    step, it takes step inputs(:math:`x_{t}`) and previous 
+    states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step 
+    outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`).
+
+    .. math::
+
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
+        \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
+        c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t}
+        h_{t} & = o_{t} \* \\tanh(c_{t})
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        direction (str, optional): The direction of the network. It can be 
+            "forward", "backward" and "bidirectional". Defaults to "forward".
+        dropout (float, optional): The droput probability. Dropout is applied 
+            to the input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input 
+            means the time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_ih` of each cell. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_hh` of each cell. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih` of each cells. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh` of each cells. Default: None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence. 
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, hidden_size]`.
+        initial_states (tuple, optional): the initial state, a tuple of (h, c), 
+            the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. 
+            If initial_state is not given, zero initial states are used.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whos time step 
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence. 
+            If `time_major` is True, the shape is 
+            `[time_steps, batch_size, num_directions * hidden_size]`, 
+            If `time_major` is False, the shape is 
+            `[batch_size, time_steps, num_directions * hidden_size]`. 
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1. 
+        final_states (Tensor): the final state, a tuple of two tensors, h and c. 
+            The shape of each is 
+            `[num_lauers * num_directions, batch_size, hidden_size]`. 
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1.
+
+    Examples:
+    
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.LSTM(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            prev_c = paddle.randn((2, 4, 32))
+            y, (h, c) = rnn(x, (prev_h, prev_c))
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(LSTM, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                            weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr,
+                                weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 2
+
+
+class GRU(RNNMixin):
+    r"""
+    Multilayer GRU. It takes input sequencse and initial states as inputs, and 
+    returns the output sequences and the final states.
+
+    Each layer inside the GRU maps the input sequences and initial states 
+    to the output sequences and final states in the following manner: at each 
+    step, it takes step inputs(:math:`x_{t}`) and previous 
+    states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) 
+    and new states(:math:`h_{t}`).
+
+    .. math::
+
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr})
+        z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz})
+        \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc}))
+        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t}
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise 
+    multiplication operator.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        direction (str, optional): The direction of the network. It can be 
+            "forward", "backward" and "bidirectional". Defaults to "forward".
+        dropout (float, optional): The droput probability. Dropout is applied 
+            to the input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input 
+            means the time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_ih` of each cell. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for 
+            `weight_hh` of each cell. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_ih` of each cells. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the 
+            `bias_hh` of each cells. Default: None.
+        name (str, optional): Name for the operation (optional, default is 
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence. 
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, hidden_size]`.
+        initial_states (Tensor, optional): the initial state. The shape is
+            `[num_lauers * num_directions, batch_size, hidden_size]`. 
+            If initial_state is not given, zero initial states are used. 
+            Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whos time step 
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence. 
+            If `time_major` is True, the shape is 
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            else, the shape is 
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1.
+        final_states (Tensor): final states. The shape is
+            `[num_lauers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional" 
+            else 1.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.GRU(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            y, h = rnn(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(GRU, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = GRUCell(input_size, hidden_size, weight_ih_attr,
+                           weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):
+                cell = GRUCell(hidden_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr,
+                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr,
+                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 2b926b5ab369046fc07c3d3d8cd56431d7f740a7..50a8755ac9f7b0a8e35c60f02a9fb825195ab80f 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -13,4 +13,1107 @@
 # limitations under the License.
 
 # TODO: define the classes of Transformer neural network
-# __all__ = [ ]
+__all__ = [
+    'MultiHeadAttention',
+    'TransformerEncoderLayer',
+    'TransformerEncoder',
+    'TransformerDecoderLayer',
+    'TransformerDecoder',
+    'Transformer',
+]
+
+import copy
+import collections
+
+from ...fluid import layers
+from ...fluid.param_attr import ParamAttr
+from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList
+from .. import functional as F
+from ...fluid.layers import utils
+from ...fluid.layers.utils import map_structure
+
+
+def _convert_param_attr_to_list(param_attr, n):
+    """
+    If `param_attr` is a list or tuple, convert every element in it to a
+    ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
+    construct a list, and rename every one by appending a increasing index
+    suffix to avoid having same names when `param_attr` contains a name.
+
+    Parameters:
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`.
+        n (int): The times to repeat to construct a list when `param_attr`
+            is not a list or tuple.
+
+    Returns:
+        list: A list composed of each including cell's `param_attr`.
+    """
+    if isinstance(param_attr, (list, tuple)):
+        assert len(param_attr) == n, (
+            "length of param_attr should be %d when it is a list/tuple" % n)
+        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+    else:
+        param_attrs = []
+        attr = ParamAttr._to_attr(param_attr)
+        for i in range(n):
+            attr_i = copy.deepcopy(attr)
+            if attr.name:
+                attr_i.name = attr_i.name + "_" + str(i)
+            param_attrs.append(attr_i)
+    return param_attrs
+
+
+class MultiHeadAttention(Layer):
+    """
+    Attention mapps queries and a set of key-value pairs to outputs, and
+    Multi-Head Attention performs multiple parallel attention to jointly attending
+    to information from different representation subspaces.
+
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
+    for more details.
+
+    Parameters:
+        embed_dim (int): The expected feature size in the input and output.
+        num_heads (int): The number of heads in multi-head attention.
+        dropout (float, optional): The dropout probability used on attention
+            weights to drop some attention targets. 0 for no dropout. Default 0
+        kdim (int, optional): The feature size in key. If None, assumed equal to
+            `embed_dim`. Default None.
+        vdim (int, optional): The feature size in value. If None, assumed equal to
+            `embed_dim`. Default None.
+        need_weights (bool, optional): Indicate whether to return the attention
+            weights. Default False.
+        weight_attr(ParamAttr, optional):  To specify the weight parameter property.
+            Default: None, which means the default weight parameter property is used.
+            See usage for details in :code:`ParamAttr` .
+        bias_attr (ParamAttr, optional): To specify the bias parameter property.
+            Default: None, which means the default bias parameter property is used.
+            If it is set to False, this layer will not have trainable bias parameter.
+            See usage for details in :code:`ParamAttr` .
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, num_heads, query_len, query_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = paddle.MultiHeadAttention(128, 2)
+            output = multi_head_attn(query, attn_mask=attn_mask)  # [2, 4, 128]
+    """
+
+    Cache = collections.namedtuple("Cache", ["k", "v"])
+    StaticCache = collections.namedtuple("StaticCache", ["k", "v"])
+
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 dropout=0.,
+                 kdim=None,
+                 vdim=None,
+                 need_weights=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(MultiHeadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.need_weights = need_weights
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.q_proj = Linear(
+            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.k_proj = Linear(
+            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.v_proj = Linear(
+            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.out_proj = Linear(
+            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+
+    def _prepare_qkv(self, query, key, value, cache=None):
+        """
+        Prapares linear projected queries, keys and values for usage of subsequnt
+        multiple parallel attention. If `cache` is not None, using cached results
+        to reduce redundant calculations.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`.
+            value (Tensor): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`.
+            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
+                It is a namedtuple with `k` and `v` as fields, and stores tensors
+                shaped `[batch_size, num_heads, length, embed_dim]` which are results
+                of linear projection, reshape and transpose calculations in
+                MultiHeadAttention. If is an instance of `Cache`, `k` and `v`
+                fields reserve intermediate results of previous positions, which
+                mostly used for decoder self attention. If it is an instance of
+                `StaticCache`, `key` and `value` args would be ignored, `k` and
+                `v` fields would be used as calculated results on `key` and
+                `value`, which mostly used for decoder-encoder cross attention.
+                It is only used for inference and should be None for training.
+                Default None.
+
+        Returns:
+            tuple: A tuple including linear projected keys and values. These two \
+                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
+        q = self.q_proj(query)
+        q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
+        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
+
+        if isinstance(cache, self.StaticCache):
+            # for encoder-decoder attention in inference and has cached
+            k, v = cache.k, cache.v
+        else:
+            k, v = self.compute_kv(key, value)
+
+        if isinstance(cache, self.Cache):
+            # for decoder self-attention in inference
+            k = layers.concat([cache.k, k], axis=2)
+            v = layers.concat([cache.v, v], axis=2)
+            cache = self.Cache(k, v)
+
+        return (q, k, v) if cache is None else (q, k, v, cache)
+
+    def compute_kv(self, key, value):
+        """
+        Applies linear projection on input keys and values, then splits heads
+        (reshape and transpose) to get keys and values from different representation
+        subspaces. The results are used as key-values pairs for subsequent multiple
+        parallel attention.
+        
+        It is part of calculations in multi-head attention, and is provided as
+        a method to pre-compute and prefetch these results, thus we can use them
+        to construct cache for inference.
+
+        Parameters:
+            key (Tensor): The keys for multi-head attention. It is a tensor
+                with shape `[batch_size, sequence_length, kdim]`. The data type
+                should be float32 or float64.
+            value (Tensor): The values for multi-head attention. It is a tensor
+                with shape `[batch_size, sequence_length, vdim]`. The data type
+                should be float32 or float64.
+
+        Returns:
+            tuple: A tuple including transformed keys and values. Their shapes \
+                both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \
+                and their data types are same as inputs.
+        """
+        k = self.k_proj(key)
+        v = self.v_proj(value)
+        k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
+        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
+        v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
+        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
+        return k, v
+
+    def gen_cache(self, key, value=None, type=Cache):
+        """
+        Generates cache for `forward` usage in inference accroding to arguments.
+        The generated cache is an instance of `MultiHeadAttention.Cache` or an
+        instance of `MultiHeadAttention.StaticCache`.
+
+        `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,
+        and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`
+        which are results of linear projection, reshape and transpose calculations
+        in MultiHeadAttention.
+        
+        If the generated cache is an instance of `Cache`, `k` and `v` fields
+        reserve intermediate result tensors of previous positions, and the tensors
+        are incremental among decoding steps, which mostly are used for decoder
+        decoder self attention.
+        
+        If the generated cache is an instance of `StaticCache`, `k` and `v` fields
+        would be used as calculated result tensors on keys an values in `forward`,
+        and the tensors keep unchanged among decoding steps, which are mostly used
+        for decoder-encoder cross attention.
+
+        The cache is generated as follows:
+
+        1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the
+        results to create an instance of `StaticCache`.
+        
+        2. If `type` is `Cache` and `value` is None, generate empty tensors shaped
+        `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results
+        to create an instance of `Cache`, where `batch_size` is from the first
+        dimension of `key`.
+
+        3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create
+        an instance of `Cache`.
+
+        Parameters:
+            key (Tensor): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If `value` is None,
+                it is only for batch size and data type reference.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, `key` is only
+                for batch size reference. Default None.
+            type (type): It should be `MultiHeadAttention.StaticCache` or
+                `MultiHeadAttention.Cache` to indicate the cache type to generate.
+        
+        Returns:
+            namedtuple: an instance of `Cache` or `StaticCache` accordingly.
+        """
+        if type == MultiHeadAttention.StaticCache:  # static_kv
+            k, v = self.compute_kv(key, value)
+            return self.StaticCache(k, v)
+        elif value is None:  # incremental_state
+            k = layers.fill_constant_batch_size_like(
+                input=key,
+                shape=[-1, self.num_heads, 0, self.head_dim],
+                dtype=key.dtype,
+                value=0)
+            v = layers.fill_constant_batch_size_like(
+                input=key,
+                shape=[-1, self.num_heads, 0, self.head_dim],
+                dtype=key.dtype,
+                value=0)
+            return self.Cache(k, v)
+        else:
+            # incremental_state with initial value, mainly for usage like UniLM
+            return self.Cache(key, value)
+
+    def forward(self, query, key, value, attn_mask=None, cache=None):
+        """
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`. Default None.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`. Default None.
+            attn_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
+                It is a namedtuple with `k` and `v` as fields, and stores tensors
+                shaped `[batch_size, num_heads, length, embed_dim]` which are results
+                of linear projection, reshape and transpose calculations in
+                MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`
+                fields reserve intermediate results of previous positions, which
+                mostly used for decoder self attention. If it is an instance of
+                `StaticCache`, `key` and `value` args would be ignored, `k` and
+                `v` fields would be used as calculated results on `key` and
+                `value`, which mostly used for decoder-encoder cross attention.
+                It is only used for inference and should be None for training.
+                Default None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `query`, representing attention output. Or a tuple if \
+                `need_weights` is True or `cache` is not None. If `need_weights` \
+                is True, except for attention output, the tuple also includes \
+                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
+                If `cache` is not None, the tuple then includes the new cache \
+                having the same type as `cache`, and if it is `StaticCache`, it \
+                is same as the input `cache`, if it is `Cache`, the new cache \
+                reserves tensors concatanating raw tensors with intermediate \
+                results of current query.
+        """
+        key = query if key is None else key
+        value = query if value is None else value
+        # compute q ,k ,v
+        if cache is None:
+            q, k, v = self._prepare_qkv(query, key, value, cache)
+        else:
+            q, k, v, cache = self._prepare_qkv(query, key, value, cache)
+
+        # scale dot product attention
+        product = layers.matmul(
+            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        if attn_mask is not None:
+            # TODO(guosheng): support bool mask
+            product = product + attn_mask
+        weights = layers.softmax(product)
+        if self.dropout:
+            weights = layers.dropout(
+                weights,
+                dropout_prob=self.dropout,
+                dropout_implementation="upscale_in_train",
+                is_test=False)
+
+        out = layers.matmul(weights, v)
+
+        # combine heads
+        out = layers.transpose(out, perm=[0, 2, 1, 3])
+        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+
+        # project to output
+        out = self.out_proj(out)
+
+        outs = [out]
+        if self.need_weights:
+            outs.append(weights)
+        if cache is not None:
+            outs.append(cache)
+        return out if len(outs) == 1 else tuple(outs)
+
+
+class TransformerEncoderLayer(Layer):
+    """
+    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
+    attention and feedforward network. Before and after each sub-layer, pre-process
+    and post-precess would be applied on the input and output accordingly. If
+    `normalize_before` is True, pre-process is layer normalization and post-precess
+    includes dropout, residual connection. Otherwise, no pre-process and post-precess
+    includes dropout, residual connection, layer normalization.
+
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
+            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
+            Default: None, which means the default weight parameter property is used.
+            See usage for details in :code:`ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
+            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
+            The `False` value means the corresponding layer would not have trainable
+            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,
+            which means the default bias parameter property is used.
+            
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerEncoderLayer
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, n_head, src_len, src_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(128, 2, 512)
+            enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        self._config = locals()
+        self._config.pop("self")
+        self._config.pop("__class__", None)  # py3
+
+        super(TransformerEncoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
+        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)
+
+        self.self_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[0],
+            bias_attr=bias_attrs[0])
+        self.linear1 = Linear(
+            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.dropout = Dropout(
+            act_dropout, dropout_implementation="upscale_in_train")
+        self.linear2 = Linear(
+            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.dropout1 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout2 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.activation = getattr(layers, activation)
+
+    def forward(self, src, src_mask=None):
+        """
+        Applies a Transformer encoder layer on the input.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder layer. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+
+        Returns:
+            Tensor: The output of Transformer encoder layer. It is a tensor that \
+                has the same shape and data type as `enc_input`.
+        """
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        # TODO(guosheng): Add cache for encoder for the usage like UniLM
+        src = self.self_attn(src, src, src, src_mask)
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(Layer):
+    """
+    TransformerEncoder is a stack of N encoder layers. 
+
+    Parameters:
+        encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It
+            would be used as the first layer, and the other layers would be created
+            according to the configurations of it.
+        num_layers (int): The number of encoder layers to be stacked.
+        norm (LayerNorm, optional): the layer normalization component. If provided,
+            apply layer normalization on the output of last encoder layer.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerEncoderLayer, TransformerEncoder
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, n_head, src_len, src_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(128, 2, 512)
+            encoder = TransformerEncoder(encoder_layer, 2)
+            enc_output = encoder(enc_input, attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super(TransformerEncoder, self).__init__()
+        self.layers = LayerList([(encoder_layer if i == 0 else
+                                  type(encoder_layer)(**encoder_layer._config))
+                                 for i in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None):
+        """
+        Applies a stack of N Transformer encoder layers on inputs. If `norm` is
+        provided, also applies layer normalization on the output of last encoder
+        layer.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+
+        Returns:
+            Tensor: The output of Transformer encoder. It is a tensor that \
+                has the same shape and data type as `src`.
+        """
+        output = src
+
+        for mod in self.layers:
+            output = mod(output, src_mask=src_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerDecoderLayer(Layer):
+    """
+    TransformerDecoderLayer is composed of three sub-layers which are decoder
+    self (multi-head) attention, decoder-encoder cross attention and feedforward
+    network. Before and after each sub-layer, pre-process and post-precess would
+    be applied on the input and output accordingly. If `normalize_before` is True,
+    pre-process is layer normalization and post-precess includes dropout, residual
+    connection. Otherwise, no pre-process and post-precess includes dropout, residual
+    connection, layer normalization.
+
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            self attention, `weight_attr[1]` would be used as `weight_attr` for
+            cross attention, and `weight_attr[2]` would be used as `weight_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `weight_attr` to create parameters. Default: None, which means the
+            default weight parameter property is used. See usage for details
+            in :ref:`api_fluid_ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            self attention, `bias_attr[1]` would be used as `bias_attr` for
+            cross attention, and `bias_attr[2]` would be used as `bias_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `bias_attr` to create parameters. The `False` value means the
+            corresponding layer would not have trainable bias parameter. See
+            usage for details in :code:`ParamAttr` . Default: None,which means
+            the default bias parameter property is used.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerDecoderLayer
+
+            # decoder input: [batch_size, tgt_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
+            self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # cross attention mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(128, 2, 512)
+            output = decoder_layer(dec_input,
+                                   enc_output,
+                                   self_attn_mask,
+                                   cross_attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        self._config = locals()
+        self._config.pop("self")
+        self._config.pop("__class__", None)  # py3
+
+        super(TransformerDecoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
+        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
+
+        self.self_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[0],
+            bias_attr=bias_attrs[0])
+        self.cross_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[1],
+            bias_attr=bias_attrs[1])
+        self.linear1 = Linear(
+            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.dropout = Dropout(
+            act_dropout, dropout_implementation="upscale_in_train")
+        self.linear2 = Linear(
+            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.norm3 = LayerNorm(d_model)
+        self.dropout1 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout2 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout3 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.activation = getattr(layers, activation)
+
+    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
+        """
+        Applies a Transformer decoder layer on the input.
+
+        Parameters:
+            tgt (Tensor): The input of Transformer decoder layer. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+            cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ),
+                `incremental_cache` is an instance of `MultiHeadAttention.Cache`,
+                `static_cache` is an instance of `MultiHeadAttention.StaticCache.
+                See `TransformerDecoderLayer.gen_cache` for more details. It is
+                only used for inference and should be None for training. Default
+                None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder layer. \
+                Or a tuple if `cache` is not None, except for decoder layer output, \
+                the tuple includes the new cache which is same as input `cache` \
+                argument but `incremental_cache` in it has an incremental length. \
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        if cache is None:
+            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None)
+        else:
+            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
+                                                    cache[0])
+        tgt = residual + self.dropout1(tgt)
+        if not self.normalize_before:
+            tgt = self.norm1(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm2(tgt)
+        if cache is None:
+            tgt = self.cross_attn(tgt, memory, memory, memory_mask, None)
+        else:
+            tgt, static_cache = self.cross_attn(tgt, memory, memory,
+                                                memory_mask, cache[1])
+        tgt = residual + self.dropout2(tgt)
+        if not self.normalize_before:
+            tgt = self.norm2(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm3(tgt)
+        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = residual + self.dropout3(tgt)
+        if not self.normalize_before:
+            tgt = self.norm3(tgt)
+        return tgt if cache is None else (tgt, (incremental_cache,
+                                                static_cache))
+
+    def gen_cache(self, memory):
+        """
+        Generates cache for `forward` usage. The generated cache is a tuple
+        composed of an instance of `MultiHeadAttention.Cache` and an instance
+        of `MultiHeadAttention.StaticCache`.
+
+        Parameters:
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+
+        Returns:
+            tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \
+                `incremental_cache` is an instance of `MultiHeadAttention.Cache` \
+                produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \
+                it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \
+                `static_cache` is an instance of `MultiHeadAttention.StaticCache` \
+                produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \
+                it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`.
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        incremental_cache = self.self_attn.gen_cache(
+            memory, type=self.self_attn.Cache)
+        static_cache = self.cross_attn.gen_cache(
+            memory, memory, type=self.cross_attn.StaticCache)
+        return incremental_cache, static_cache
+
+
+class TransformerDecoder(Layer):
+    """
+    TransformerDecoder is a stack of N decoder layers. 
+
+    Parameters:
+        decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It
+            would be used as the first layer, and the other layers would be created
+            according to the configurations of it.
+        num_layers (int): The number of decoder layers to be stacked.
+        norm (LayerNorm, optional): the layer normalization component. If provided,
+            apply layer normalization on the output of last encoder layer.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerDecoderLayer, TransformerDecoder
+
+            # decoder input: [batch_size, tgt_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
+            self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # cross attention mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(128, 2, 512)
+            decoder = TransformerDecoder(decoder_layer, 2)
+            output = decoder(dec_input,
+                             enc_output,
+                             self_attn_mask,
+                             cross_attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self, decoder_layer, num_layers, norm=None):
+        super(TransformerDecoder, self).__init__()
+        self.layers = LayerList([(decoder_layer if i == 0 else
+                                  type(decoder_layer)(**decoder_layer._config))
+                                 for i in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
+        """
+        Applies a stack of N Transformer decoder layers on inputs. If `norm` is
+        provided, also applies layer normalization on the output of last decoder
+        layer.
+
+        Parameters:
+            tgt (Tensor): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+            cache (list, optional): It is a list, and each element in the list
+                is a tuple( :code:`(incremental_cache, static_cache)` ). See
+                `TransformerDecoder.gen_cache` for more details. It is only
+                used for inference and should be None for training. Default None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder. \
+                Or a tuple if `cache` is not None, except for decoder output, \
+                the tuple includes the new cache which is same as input `cache` \
+                argument but `incremental_cache` in it has an incremental length. \
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        output = tgt
+        new_caches = []
+        for i, mod in enumerate(self.layers):
+            if cache is None:
+                output = mod(output,
+                             memory,
+                             tgt_mask=tgt_mask,
+                             memory_mask=memory_mask,
+                             cache=None)
+            else:
+                output, new_cache = mod(output,
+                                        memory,
+                                        tgt_mask=tgt_mask,
+                                        memory_mask=memory_mask,
+                                        cache=cache[i])
+                new_caches.append(new_cache)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output if cache is None else (output, new_caches)
+
+    def gen_cache(self, memory, do_zip=False):
+        """
+        Generates cache for `forward` usage. The generated cache is a list, and
+        each element in it is a tuple( :code:`(incremental_cache, static_cache)` )
+        produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`
+        for more details. If `do_zip` is True, apply `zip` on these tuples to get
+        a list with two elements.
+
+
+        Parameters:
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            do_zip (bool, optional): Indicate whether to apply `zip` on the tuples.
+                If True, return a list with two elements. Default False
+
+        Returns:
+            list: It is a list, and each element in the list is a tuple produced \
+                by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \
+                for more details. If `do_zip` is True, apply `zip` on these tuples \
+                and return a list with two elements.
+        """
+        cache = [layer.gen_cache(memory) for layer in self.layers]
+        if do_zip:
+            cache = list(zip(*cache))
+        return cache
+
+
+class Transformer(Layer):
+    """
+    A Transformer model composed of an instance of `TransformerEncoder` and an
+    instance of `TransformerDecoder`. While the embedding layer and output layer
+    are not included.
+
+    Please refer to `Attention is all you need <http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_ ,
+    and see `TransformerEncoder` and `TransformerDecoder` for more details.
+    
+    Users can configurate the model architecture with corresponding parameters.
+    Note the usage of `normalize_before` representing where to apply layer
+    normalization (in pre-process or post-precess of multi-head attention or FFN),
+    and some transformer like models are different on this, such as
+    `BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ . 
+    The default architecture here places layer normalization in post-process and
+    applies another layer normalization on the output of last encoder/decoder layer.
+
+    Parameters:
+        d_model (int): The expected feature size in the encoder/decoder input
+            and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        num_encoder_layers (int): The number of layers in encoder.
+        num_encoder_layers (int): The number of layers in decoder.
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            self attention, `weight_attr[1]` would be used as `weight_attr` for
+            cross attention, and `weight_attr[2]` would be used as `weight_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `weight_attr` to create parameters. Default: None, which means the
+            default weight parameter property is used. See usage for details
+            in :code:`ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            self attention, `bias_attr[1]` would be used as `bias_attr` for
+            cross attention, and `bias_attr[2]` would be used as `bias_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `bias_attr` to create parameters. The `False` value means the
+            corresponding layer would not have trainable bias parameter. See
+            usage for details in :code:`ParamAttr` . Default: None,which means
+            the default bias parameter property is used.
+        custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
+            Default None
+        custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
+            Default None
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import Transformer
+
+            # src: [batch_size, tgt_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # tgt: [batch_size, src_len, d_model]
+            dec_input = paddle.rand((2, 6, 128))
+            # src_mask: [batch_size, n_head, src_len, src_len]
+            enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
+            dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
+            # memory_mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 6, 4))
+            transformer = Transformer(128, 2, 4, 4, 512)
+            output = transformer(enc_input,
+                                 dec_input,
+                                 enc_self_attn_mask,
+                                 dec_self_attn_mask,
+                                 cross_attn_mask)  # [2, 6, 128]
+    """
+
+    def __init__(self,
+                 d_model=512,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None,
+                 custom_encoder=None,
+                 custom_decoder=None):
+        super(Transformer, self).__init__()
+
+        if custom_encoder is not None:
+            self.encoder = custom_encoder
+        else:
+            encoder_layer = TransformerEncoderLayer(
+                d_model, nhead, dim_feedforward, dropout, activation,
+                attn_dropout, act_dropout, normalize_before, weight_attr,
+                bias_attr)
+            encoder_norm = LayerNorm(d_model)
+            self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
+                                              encoder_norm)
+
+        if custom_decoder is not None:
+            self.decoder = custom_decoder
+        else:
+            decoder_layer = TransformerDecoderLayer(
+                d_model, nhead, dim_feedforward, dropout, activation,
+                attn_dropout, act_dropout, normalize_before, weight_attr,
+                bias_attr)
+            decoder_norm = LayerNorm(d_model)
+            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
+                                              decoder_norm)
+
+        self.d_model = d_model
+        self.nhead = nhead
+
+    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
+        """
+        Applies a Transformer model on the inputs.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt (Tensor): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+
+        Returns:
+            Tensor: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder.
+        """
+        memory = self.encoder(src, src_mask=src_mask)
+        output = self.decoder(
+            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
+        return output
diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f360ec02e6d8b59b80db4602776e904cf0b499
--- /dev/null
+++ b/python/paddle/nn/layer/vision.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: define specitial functions used in computer vision task 
+
+from ...fluid.dygraph import layers
+from .. import functional
+
+__all__ = ['PixelShuffle']
+
+
+class PixelShuffle(layers.Layer):
+    """
+    
+    PixelShuffle Layer    
+
+    This operator rearranges elements in a tensor of shape [N, C, H, W]
+    to a tensor of shape [N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor],
+    or from shape [N, H, W, C] to [N, H*upscale_factor, W*upscale_factor, C/upscale_factor**2].
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of 1/upscale_factor.
+    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution
+    Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_ .
+    by Shi et. al (2016) for more details.
+
+    Parameters:
+
+        upscale_factor(int): factor to increase spatial resolution.
+        data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x: 4-D tensor with shape: (N, C, H, W) or (N, H, W, C).
+        - out: 4-D tensor with shape: (N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor) or (N, H*upscale_factor, W*upscale_factor, C/upscale_factor^2).
+
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
+            x_var = paddle.to_tensor(x)
+            pixel_shuffle = nn.PixelShuffle(3)
+            out_var = pixel_shuffle(x_var)
+            out = out_var.numpy()
+            print(out.shape) 
+            # (2, 1, 12, 12)
+
+    """
+
+    def __init__(self, upscale_factor, data_format="NCHW", name=None):
+        super(PixelShuffle, self).__init__()
+
+        if not isinstance(upscale_factor, int):
+            raise TypeError("upscale factor must be int type")
+
+        if data_format not in ["NCHW", "NHWC"]:
+            raise ValueError("Data format should be 'NCHW' or 'NHWC'."
+                             "But recevie data format: {}".format(data_format))
+
+        self._upscale_factor = upscale_factor
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return functional.pixel_shuffle(x, self._upscale_factor,
+                                        self._data_format, self._name)
diff --git a/python/paddle/nn/input.py b/python/paddle/nn/utils/__init__.py
similarity index 81%
rename from python/paddle/nn/input.py
rename to python/paddle/nn/utils/__init__.py
index b5f591f44a9a167dcba8e4e46322ca157a5e48cb..6562ac35e1e3180db671f90188f1304f07864189 100644
--- a/python/paddle/nn/input.py
+++ b/python/paddle/nn/utils/__init__.py
@@ -12,10 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define input placeholders of neural network  
-from ..fluid import data  #DEFINE_ALIAS
-
-__all__ = [
-    'data',
-    #       'Input'
-]
+from . import weight_norm_hook
+from .weight_norm_hook import weight_norm, remove_weight_norm
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad53bf394660f3a7e0e48fdbd5eb530abd0852bb
--- /dev/null
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from ... import fluid
+from ...fluid import dygraph
+from ...fluid import layers as F
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+from ...tensor.math import multiply
+
+__all__ = ['weight_norm', 'remove_weight_norm']
+
+
+def l2_norm(x, axis, epsilon=1e-12, name=None):
+    if len(x.shape) == 1:
+        axis = 0
+    check_variable_and_dtype(x, "X", ("float32", "float64"), "norm")
+
+    helper = LayerHelper("l2_normalize", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="norm",
+        inputs={"X": x},
+        outputs={"Out": out,
+                 "Norm": norm},
+        attrs={
+            "axis": 1 if axis is None else axis,
+            "epsilon": epsilon,
+        })
+    return F.squeeze(norm, axes=[axis])
+
+
+def norm_except_dim(p, dim):
+    shape = p.shape
+    ndims = len(shape)
+    if dim == -1:
+        return F.sqrt(F.reduce_sum(F.square(p)) + 1e-12)
+    elif dim == 0:
+        p_matrix = F.reshape(p, (shape[0], -1))
+        return l2_norm(p_matrix, axis=1)
+    elif dim == ndims - 1:
+        p_matrix = F.reshape(p, (-1, shape[-1]))
+        return l2_norm(p_matrix, axis=0)
+    else:
+        perm = list(range(ndims))
+        perm[0] = dim
+        perm[dim] = 0
+        p_transposed = F.transpose(p, perm)
+        return norm_except_dim(p_transposed, 0)
+
+
+def _weight_norm(v, g, dim):
+    shape = v.shape
+    ndims = len(shape)
+
+    if dim == -1:
+        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
+    elif dim == 0:
+        p_matrix = F.reshape(v, (shape[0], -1))
+        v_normalized = F.l2_normalize(p_matrix, axis=1)
+        v_normalized = F.reshape(v_normalized, shape)
+    elif dim == ndims - 1:
+        p_matrix = F.reshape(v, (-1, shape[-1]))
+        v_normalized = F.l2_normalize(p_matrix, axis=0)
+        v_normalized = F.reshape(v_normalized, shape)
+    else:
+        perm = list(range(ndims))
+        perm[0] = dim
+        perm[dim] = 0
+        p_transposed = F.transpose(v, perm)
+        transposed_shape = p_transposed.shape
+        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
+        v_normalized = F.l2_normalize(p_matrix, axis=1)
+        v_normalized = F.reshape(v_normalized, transposed_shape)
+        v_normalized = F.transpose(v_normalized, perm)
+    weight = multiply(v_normalized, g, axis=dim if dim is not None else -1)
+    return weight
+
+
+class WeightNorm(object):
+    def __init__(self, name, dim):
+        if dim is None:
+            dim = -1
+        self.name = name
+        self.dim = dim
+
+    def compute_weight(self, layer):
+        g = getattr(layer, self.name + '_g')
+        v = getattr(layer, self.name + '_v')
+        return _weight_norm(v, g, self.dim)
+
+    @staticmethod
+    def apply(layer, name, dim):
+        for k, hook in layer._forward_pre_hooks.items():
+            if isinstance(hook, WeightNorm) and hook.name == name:
+                raise RuntimeError("Cannot register two weight_norm hooks on "
+                                   "the same parameter {}".format(name))
+
+        if dim is None:
+            dim = -1
+
+        fn = WeightNorm(name, dim)
+
+        w = getattr(layer, name)
+        del layer._parameters[name]
+
+        g_var = norm_except_dim(w, dim)
+        v = layer.create_parameter(w.shape, dtype=w.dtype)
+        layer.add_parameter(name + "_v", v)
+        g = layer.create_parameter(g_var.shape, dtype=g_var.dtype)
+        layer.add_parameter(name + '_g', g)
+        with dygraph.no_grad():
+            F.assign(w, v)
+            F.assign(g_var, g)
+        setattr(layer, name, fn.compute_weight(layer))
+
+        layer.register_forward_pre_hook(fn)
+        return fn
+
+    def remove(self, layer):
+        w_var = self.compute_weight(layer)
+        delattr(layer, self.name)
+        del layer._parameters[self.name + '_g']
+        del layer._parameters[self.name + '_v']
+        w = layer.create_parameter(w_var.shape, dtype=w_var.dtype)
+        layer.add_parameter(self.name, w)
+        with dygraph.no_grad():
+            F.assign(w_var, w)
+
+    def __call__(self, layer, inputs):
+        setattr(layer, self.name, self.compute_weight(layer))
+
+
+def weight_norm(layer, name='weight', dim=0):
+    """
+    This weight_norm layer applies weight normalization to a parameter according to the 
+    following formula:
+
+    .. math::
+
+        \mathbf{w} = g \dfrac{v}{\|v\|}
+
+    Weight normalization is a reparameterization of the weight vectors in a neural network that 
+    decouples the magnitude of those weight vectors from their direction. Weight normalization 
+    replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter 
+    specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction 
+    (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: 
+    `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks
+    <https://arxiv.org/pdf/1602.07868.pdf>`_.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has weight.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+        dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number 
+              which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, 
+              1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. 
+              If dim is set to None, meaning that all elements will be normalized. Default: 0.
+    
+    Returns:
+        Origin layer with weight norm hook.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          from paddle.nn import Conv2D
+          from paddle.nn.utils import weight_norm
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          conv = Conv2D(3, 5, 3)
+          wn = weight_norm(conv)
+          print(conv.weight_g.shape)
+          # [5]
+          print(conv.weight_v.shape)
+          # [5, 3, 3, 3]
+    """
+    WeightNorm.apply(layer, name, dim)
+    return layer
+
+
+def remove_weight_norm(layer, name='weight'):
+    """
+    remove weight normalization from layer.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has weight.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+
+    Returns:
+        Origin layer without weight norm
+
+    Examples:
+        .. code-block:: python
+          import paddle
+          from paddle.nn import Conv2D
+          from paddle.nn.utils import weight_norm, remove_weight_norm
+
+          paddle.disable_static()
+          conv = Conv2D(3, 5, 3)
+          wn = weight_norm(conv)
+          remove_weight_norm(conv)
+          print(conv.weight_g)
+          # AttributeError: 'Conv2D' object has no attribute 'weight_g'
+    """
+    for k, hook in layer._forward_pre_hooks.items():
+        if isinstance(hook, WeightNorm) and hook.name == name:
+            hook.remove(layer)
+            del layer._forward_pre_hooks[k]
+            return layer
+
+    raise ValueError("weight_norm of '{}' not found in {}".format(name, layer))
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 4dc3cf397aea59f3fedfc86bff7a77556a6a63a7..49314c9832dd389411dffb3f498b34d09337a3f0 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -14,21 +14,32 @@
 
 __all__ = [
     'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
-    'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad',
-    'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd',
-    'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer',
-    'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer',
-    'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer',
-    'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD',
-    'SGDOptimizer'
+    'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer',
+    'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer',
+    'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer',
+    'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer',
+    'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer',
+    'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer',
+    '_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR',
+    'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR',
+    'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR'
 ]
 
 
-from ..fluid.optimizer import  SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \
-            Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \
-            AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \
-            DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \
-            AdadeltaOptimizer, ModelAverage, LarsMomentum, \
-            LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \
+from ..fluid.optimizer import  SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
+            Ftrl, Adadelta, \
+            SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
+            DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
+            ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
             ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
-            RecomputeOptimizer
+            RecomputeOptimizer, LarsMomentumOptimizer
+
+from .optimizer import Optimizer
+from .adam import Adam
+from .adamw import AdamW
+from .adamax import Adamax
+from .rmsprop import RMSProp
+
+from . import lr_scheduler
+from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
+            LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da8053fe8a3495f5d3188a737638531347de648
--- /dev/null
+++ b/python/paddle/optimizer/adam.py
@@ -0,0 +1,246 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["Adam"]
+
+
+class Adam(Optimizer):
+    """
+    The Adam optimizer uses an optimization described at the end
+    of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
+    it can dynamically adjusts the learning rate of each parameter using
+    the 1st moment estimates and the 2nd moment estimates of the gradient.
+    
+    The parameter ``param_out`` update rule with gradient ``grad``:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+
+    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.9.
+        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            The default value is 1e-08.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+            The accumulators are updated at every step. Every element of the two moving-average
+            is updated in both dense mode and sparse mode. If the size of parameter is very large,
+            then the update may be very slow. The lazy mode only update the element that has
+            gradient in current mini-batch, so it will be much more faster. But this mode has
+            different semantics with the original Adam algorithm and may lead to different result.
+            The default value is False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters())
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+        .. code-block:: python
+
+            # Adam with beta1/beta2 as Tensor and weight_decay as float
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+    _moment1_acc_str = "moment1"
+    _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None,
+                 lazy_mode=False):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(Adam, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "adam"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+        self._lazy_mode = lazy_mode
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(self._moment1_acc_str, p)
+            self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                        else self._beta1,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                        else self._beta2,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+        lr = self._create_param_lr(param_and_grad)
+        # create the adam optimize op
+
+        if framework.in_dygraph_mode():
+            _beta1 = self._beta1 if not isinstance(
+                self._beta1, Variable) else self._beta1.numpy().item(0)
+            _beta2 = self._beta2 if not isinstance(
+                self._beta2, Variable) else self._beta2.numpy().item(0)
+            _, _, _, _, _ = core.ops.adam(
+                param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
+                beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
+                moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
+                'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
+                1000, 'beta1', _beta1, 'beta2', _beta2)
+
+            return None
+
+        inputs = {
+            "Param": [param_and_grad[0]],
+            "Grad": [param_and_grad[1]],
+            "LearningRate": [lr],
+            "Moment1": [moment1],
+            "Moment2": [moment2],
+            "Beta1Pow": [beta1_pow_acc],
+            "Beta2Pow": [beta2_pow_acc]
+        }
+        outputs = {
+            "ParamOut": [param_and_grad[0]],
+            "Moment1Out": [moment1],
+            "Moment2Out": [moment2],
+            "Beta1PowOut": [beta1_pow_acc],
+            "Beta2PowOut": [beta2_pow_acc],
+        }
+        attrs = {
+            "epsilon": self._epsilon,
+            "lazy_mode": self._lazy_mode,
+            "min_row_size_to_use_multithread": 1000
+        }
+
+        if isinstance(self._beta1, Variable):
+            inputs['Beta1Tensor'] = self._beta1
+        else:
+            attrs['beta1'] = self._beta1
+        if isinstance(self._beta2, Variable):
+            inputs['Beta2Tensor'] = self._beta2
+        else:
+            attrs['beta2'] = self._beta2
+
+        adam_op = block.append_op(
+            type=self.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
+            stop_gradient=True)
+
+        return adam_op
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a78b17cbba55c1ee90a2708f6c163940158a51
--- /dev/null
+++ b/python/paddle/optimizer/adamax.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Adamax"]
+
+
+class Adamax(Optimizer):
+    """
+    The Adamax optimizer is implemented based on the Adamax Optimization 
+    in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
+    The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
+    which makes the learning rate update algorithm more stable and simple.
+
+    The parameter ``param_out`` update rule with gradient ``grad``:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
+
+        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
+
+        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
+
+    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
+
+    The original paper does not have an ``epsilon`` attribute,
+    it is added here for numerical stability to prevent the division by 0 error.
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            The default value is 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            The default value is 1e-08.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    **Notes**:
+        **Currently, Adamax doesn't support sparse parameter optimization.**
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.Adamax(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+    _moment_acc_str = "moment"
+    _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(Adamax, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "adamax"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        # Create accumulator tensors for first moment and infinity norm
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+            self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=self._beta1,
+                shape=[1])
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
+        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
+                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        # create the adamax optimize op
+        adamax_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "Moment": moment,
+                "InfNorm": inf_norm,
+                "Beta1Pow": beta1_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": moment,
+                "InfNormOut": inf_norm
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            },
+            stop_gradient=True)
+
+        return adamax_op
+
+    def _finish_update(self, block, parameters_and_grads):
+        """Update Beta1 Power accumulator
+        """
+        assert isinstance(block, framework.Block)
+        for param, grad in parameters_and_grads:
+            if grad is None or param.trainable is False:
+                continue
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamax'):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1},
+                    stop_gradient=True)
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..f498fcbffa24ec188b57ceb2d3c6884fc1e135d2
--- /dev/null
+++ b/python/paddle/optimizer/adamw.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from .adam import Adam
+from ..fluid import framework
+import paddle
+__all__ = ['AdamW']
+
+
+class DecoupledWeightDecay(object):
+    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
+        if not isinstance(coeff, float) and \
+                not isinstance(coeff, framework.Variable):
+            raise TypeError("coeff should be float or Tensor.")
+        self._params_name = set()
+        self._apply_decay_param_fun = apply_decay_param_fun
+        self._coeff = coeff
+        super(DecoupledWeightDecay, self).__init__(**kwargs)
+
+    def _scale_parameters(self, params_and_grads):
+        """
+        Adds weight decay ops.
+            scaled_parameter = parameter * coeff
+
+        Args:
+            params_and_grads: A list of (parameters, gradients) pairs,
+                the parameters need to decay.
+        Raises:
+            Exception: The type of coeff and parameter is not consistent.
+        """
+        if isinstance(self._coeff, float) and self._coeff == 0.0:
+            return
+
+        scaled_params = []
+        for param, grad in params_and_grads:
+            # If no gradient then we don't need to do anything
+            if grad is None:
+                continue
+            if self._apply_decay_param_fun is not None \
+                    and not self._apply_decay_param_fun(param.name):
+                continue
+
+            if isinstance(self._coeff, float):
+                assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
+                    "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
+            else:
+                assert self._coeff.dtype == param.dtype, \
+                    "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
+
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                assert param.name not in self._params_name
+                scaled_params.append((param, grad, param * self._coeff))
+                self._params_name.add(param.name)
+        return scaled_params
+
+    def backward(self, **kargs):
+        return super(DecoupledWeightDecay, self).backward(**kargs)
+
+    def _apply_optimize(self, **kargs):
+        return super(DecoupledWeightDecay, self)._apply_optimize(**kargs)
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None):
+        params_grads = self.backward(
+            loss=loss,
+            startup_program=startup_program,
+            parameters=parameters,
+            no_grad_set=no_grad_set)
+        scaled_params = self._scale_parameters(params_grads)
+        for p_grad_sgrad in scaled_params:
+            param, grad, scaled_param = p_grad_sgrad
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                updated_param = paddle.fluid.layers.elementwise_sub(
+                    x=param, y=scaled_param)
+                paddle.fluid.layers.assign(input=updated_param, output=param)
+
+        optimize_ops = self._apply_optimize(
+            loss=loss,
+            params_grads=params_grads,
+            startup_program=startup_program)
+        return optimize_ops, params_grads
+
+    @framework.dygraph_only
+    def step(self):
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        scaled_params = self._scale_parameters(params_grads)
+        for p_grad_sgrad in scaled_params:
+            param, grad, scaled_param = p_grad_sgrad
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                updated_param = paddle.fluid.layers.elementwise_sub(
+                    x=param, y=scaled_param)
+                paddle.fluid.layers.assign(input=updated_param, output=param)
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
+
+    def __str__(self):
+        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
+
+
+class AdamW(DecoupledWeightDecay, Adam):
+    """
+    The AdamW optimizer is implemented based on the AdamW Optimization 
+    in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
+    it can resolves the problem of L2 regularization failure in the Adam optimizer.
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+        
+        moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+            \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t}
+
+        param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.9.
+        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+        weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0.
+            The default value is 1e-08.
+        apply_decay_param_fun (function|None): If it is not None,
+            only tensors that makes apply_decay_param_fun(Tensor)==True 
+            will be updated. It only works when we want to specify tensors.
+            Default: None.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+            The accumulators are updated at every step. Every element of the two moving-average
+            is updated in both dense mode and sparse mode. If the size of parameter is very large,
+            then the update may be very slow. The lazy mode only update the element that has
+            gradient in current mini-batch, so it will be much more faster. But this mode has
+            different semantics with the original Adam algorithm and may lead to different result.
+            The default value is False.
+    **Notes**:
+        **Currently, AdamW doesn't support sparse parameter optimization.**
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.AdamW(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 parameters=None,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 weight_decay=0.0,
+                 apply_decay_param_fun=None,
+                 grad_clip=None,
+                 name=None,
+                 lazy_mode=False):
+        args_dict = {
+            "learning_rate": learning_rate,
+            "parameters": parameters,
+            "beta1": beta1,
+            "beta2": beta2,
+            "epsilon": epsilon,
+            "grad_clip": grad_clip,
+            "name": name,
+            "lazy_mode": lazy_mode
+        }
+        super(AdamW, self).__init__(
+            weight_decay,
+            apply_decay_param_fun=apply_decay_param_fun,
+            **args_dict)
diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b
--- /dev/null
+++ b/python/paddle/optimizer/lr_scheduler.py
@@ -0,0 +1,1430 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy
+import warnings
+from paddle import Tensor
+
+__all__ = [
+    'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
+    'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
+    'ReduceLROnPlateau', 'CosineAnnealingLR'
+]
+
+
+class _LRScheduler(object):
+    """LRScheduler Base class.
+
+    Define the common interface of an LRScheduler.
+    User can 'form paddle.optimizer.lr_scheduler import _LRScheduler'
+    And inherit from it to have a custom implementation of get_lr().
+    """
+
+    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of learning rate must be float, but received {}".
+                format(type(learning_rate)))
+        self.base_lr = float(learning_rate)
+        self.last_lr = float(learning_rate)
+        self.last_epoch = last_epoch
+        self.verbose = verbose
+        self._var_name = None
+
+        self.step()
+
+    def __call__(self):
+        """ 
+        Return last computed learning rate on current epoch.
+        """
+        return self.last_lr
+
+    def step(self, epoch=None):
+        """
+        'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'.  
+        The new learning rate will take effect on next epoch.
+
+        Args:
+            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+
+        Returns:
+            None
+        
+        Examples:
+            Please refer to the example of current _LRScheduler. 
+        """
+        if epoch is None:
+            self.last_epoch += 1
+            self.last_lr = self.get_lr()
+        else:
+            self.last_epoch = epoch
+            if hasattr(self, "_get_closed_form_lr"):
+                self.last_lr = self._get_closed_form_lr()
+            else:
+                self.last_lr = self.get_lr()
+
+        if self.verbose:
+            print('Epoch {}: {} set learning rate to {}.'.format(
+                self.last_epoch, self.__class__.__name__, self.last_lr))
+
+    def state_dict(self):
+        """
+        Returns the state of the scheduler as a :class:`dict`.
+
+        It is a subset of self.__dict__ .
+        """
+        self._state_keys()
+        state_dict = {}
+        for key in self.keys:
+            if key not in self.__dict__:
+                continue
+            value = self.__dict__[key]
+            if isinstance(value, Tensor):
+                assert value.shape == [
+                    1
+                ], "shape of Tensor in state_dict must be [1] {}".format(
+                    value.shape)
+                value = value.numpy()[0]
+            state_dict[key] = value
+
+        return state_dict
+
+    # For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default.
+    # (Note): you can change it for your subclass.
+    def _state_keys(self):
+        """
+        set the keys in self.__dict__ that are needed to be saved.
+        """
+        self.keys = ['last_epoch', 'last_lr']
+
+    def set_dict(self, state_dict):
+        """
+        Loads the schedulers state.
+        """
+        self._state_keys()
+        for key in self.keys:
+            if key in state_dict:
+                self.__dict__[key] = state_dict[key]
+            else:
+                raise RuntimeError(
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
+                    format(key))
+        if len(state_dict) > len(self.keys):
+            warnings.warn(
+                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
+            )
+
+    # alias for set_dict
+    set_state_dict = set_dict
+
+    def get_lr(self):
+        # calculate by python float
+        raise NotImplementedError
+
+
+class NoamLR(_LRScheduler):
+    """
+
+    Applies Noam Lear to the initial learning rate. 
+
+    The algorithm can be described as following.
+
+    .. math::
+
+        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})
+
+    Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ 
+
+
+    Args:
+        d$_{model}$(int): The dimensionality of input and output feature vector of model. It is a python int number.
+        warmup_steps(int): The number of warmup steps. A super parameter. It is a python int number
+        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``NoamLR`` instance to schedule learning rate.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 learning_rate=1.0,
+                 last_epoch=-1,
+                 verbose=False):
+        self.d_model = d_model
+        self.warmup_steps = warmup_steps
+        super(NoamLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            a = 1
+        else:
+            a = self.last_epoch**-0.5
+        b = self.warmup_steps**-1.5 * self.last_epoch
+        return self.base_lr * (self.d_model**-0.5) * min(a, b)
+
+
+class PiecewiseLR(_LRScheduler):
+    """
+
+    Piecewise learning rate scheduler.
+
+    The algorithm can be described as the code below:
+
+    .. code-block:: text
+
+        boundaries = [100, 200]
+        values = [1.0, 0.5, 0.1]
+        if epoch < 100:
+            learning_rate = 1.0
+        elif 100 <= global_step < 200:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+
+    Args:
+        boundaries(list): A list of steps numbers. The type of element in the list is python int. 
+        values(list): A list of learning rate values that will be picked during different epoch boundaries. 
+            The type of element in the list is python float.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``PiecewiseLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
+        self.boundaries = boundaries
+        self.values = values
+        super(PiecewiseLR, self).__init__(
+            last_epoch=last_epoch, verbose=verbose)
+
+    def get_lr(self):
+
+        for i in range(len(self.boundaries)):
+            if self.last_epoch < self.boundaries[i]:
+                return self.values[i]
+        return self.values[len(self.values) - 1]
+
+
+class NaturalExpLR(_LRScheduler):
+    """
+
+    Applies natural exponential decay to the initial learning rate.
+    
+    The algorithm can be described as following:
+
+    .. math::
+
+        new\_learning\_rate = learning\_rate * e^{- gama * epoch}
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float, optional): A Ratio to update the learning rate. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``NaturalExpLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
+
+
+class InverseTimeLR(_LRScheduler):
+    """
+
+    Applies inverse time decay to the initial learning rate.
+
+    The algorithm can be described as following:
+
+    .. math::
+
+        new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch}
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``InverseTimeLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr / (1 + self.gamma * self.last_epoch)
+
+
+class PolynomialLR(_LRScheduler):
+    """
+
+    Applies polynomial decay to the initial learning rate.
+
+    The algorithm can be described as following.
+
+    If cycle is set to True, then:
+
+    .. math::
+
+        decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps}) 
+
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+
+    If cycle is set to False, then:
+
+    .. math::
+
+        epoch & = min(epoch, decay\_steps) 
+
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
+        power(float, optional): Power of polynomial. Default: 1.0.
+        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease 
+            to ``end_lr`` .  If False, the learning rate is monotone decreasing. Default: False.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``PolynomialLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 decay_steps,
+                 end_lr=0.0001,
+                 power=1.0,
+                 cycle=False,
+                 last_epoch=-1,
+                 verbose=False):
+        self.decay_steps = decay_steps
+        self.end_lr = end_lr
+        self.power = power
+        self.cycle = cycle
+        super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        tmp_epoch_num = self.last_epoch
+        tmp_decay_steps = self.decay_steps
+        if self.cycle:
+            div_res = math.ceil(
+                float(self.last_epoch) / float(self.decay_steps))
+
+            if self.last_epoch == 0:
+                div_res = 1
+            tmp_decay_steps = self.decay_steps * div_res
+        else:
+            tmp_epoch_num = min(self.last_epoch, self.decay_steps)
+
+        return (self.base_lr - self.end_lr) * (
+            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
+             )**self.power) + self.end_lr
+
+
+class LinearLrWarmup(_LRScheduler):
+    """
+
+    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
+    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
+    
+    When epoch < warmup_steps, learning rate is updated as:
+    
+    .. code-block:: text
+    
+            lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps)
+    
+    where start_lr is the initial learning rate, and end_lr is the final learning rate;
+    
+    When epoch >= warmup_steps, learning rate is updated as:
+    
+    .. code-block:: text
+    
+            lr = learning_rate
+    
+    where lr is float or any subclass of ``_LRScheduler`` .
+
+    Args:
+        learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` .
+        warmup_steps (int): total steps of warm up.
+        start_lr (float): Initial learning rate of warm up.
+        end_lr (float): Final learning rate of warm up.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``LinearLrWarmup`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.LinearLrWarmup(
+                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup(
+                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()      
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 warmup_steps,
+                 start_lr,
+                 end_lr,
+                 last_epoch=-1,
+                 verbose=False):
+        type_check = isinstance(learning_rate, float) or isinstance(
+            learning_rate, int) or isinstance(learning_rate, _LRScheduler)
+        if not type_check:
+            raise TypeError(
+                "the type of learning_rate should be [int, float or _LRScheduler], the current type is {}".
+                format(learning_rate))
+        self.learning_rate = learning_rate
+        self.warmup_steps = warmup_steps
+        self.start_lr = start_lr
+        self.end_lr = end_lr
+        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
+            end_lr, start_lr)
+        super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose)
+
+    def get_lr(self):
+        if self.last_epoch < self.warmup_steps:
+            return (self.end_lr - self.start_lr) * float(
+                self.last_epoch) / float(self.warmup_steps) + self.start_lr
+        else:
+            if isinstance(self.learning_rate, _LRScheduler):
+                self.learning_rate.step()
+                return self.learning_rate()
+
+            return self.learning_rate
+
+
+class ExponentialLR(_LRScheduler):
+    """
+
+    Update learning rate by 'gamma' each epoch.
+
+    The algorithm can be described as following.
+    
+    .. math::
+
+        new\_learning\_rate = last\_learning\_rate * gamma
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``ExponentialLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * (self.gamma**self.last_epoch)
+
+
+class MultiStepLR(_LRScheduler):
+    """
+    Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        milestones = [30, 50]
+        gamma = 0.1
+        if epoch < 30:
+            learning_rate = 0.5
+        elif epoch < 50:
+            learning_rate = 0.05
+        else:
+            learning_rate = 0.005
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+        
+
+    Returns:
+        ``MultiStepLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 milestones,
+                 gamma=0.1,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(milestones, (tuple, list)):
+            raise TypeError(
+                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
+                % type(milestones))
+
+        if not all([
+                milestones[i] < milestones[i + 1]
+                for i in range(len(milestones) - 1)
+        ]):
+            raise ValueError('The elements of milestones must be incremented')
+        if gamma >= 1.0:
+            raise ValueError('gamma should be < 1.0.')
+
+        self.milestones = milestones
+        self.gamma = gamma
+        super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        for i in range(len(self.milestones)):
+            if self.last_epoch < self.milestones[i]:
+                return self.base_lr * (self.gamma**i)
+        return self.base_lr * (self.gamma**len(self.milestones))
+
+
+class StepLR(_LRScheduler):
+    """
+    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        step_size = 30
+        gamma = 0.1
+
+        learning_rate = 0.5     if epoch < 30
+        learning_rate = 0.05    if 30 <= epoch < 60
+        learning_rate = 0.005   if 60 <= epoch < 90
+        ...
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        step_size (int): the interval to update.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``StepLR`` instance to schedule learning rate.
+
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 step_size,
+                 gamma=0.1,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(step_size, int):
+            raise TypeError(
+                "The type of 'step_size' must be 'int', but received %s." %
+                type(step_size))
+        if gamma >= 1.0:
+            raise ValueError('gamma should be < 1.0.')
+
+        self.step_size = step_size
+        self.gamma = gamma
+        super(StepLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        i = self.last_epoch // self.step_size
+        return self.base_lr * (self.gamma**i)
+
+
+class LambdaLR(_LRScheduler):
+    """
+    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` .
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5        # init learning_rate
+        lr_lambda = lambda epoch: 0.95 ** epoch
+
+        learning_rate = 0.5        # epoch 0
+        learning_rate = 0.475      # epoch 1
+        learning_rate = 0.45125    # epoch 2
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+    
+    Returns:
+        ``LambdaLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
+        if not callable(lr_lambda):
+            raise TypeError(
+                "The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s."
+                % type(lr_lambda))
+
+        self.lr_lambda = lr_lambda
+        super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * self.lr_lambda(self.last_epoch)
+
+
+class ReduceLROnPlateau(_LRScheduler):
+    """
+    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate 
+    by 2 to 10 times once model performance has no longer improvement.
+
+    The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` 
+    stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . 
+    (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` 
+    number of epochs, the learning rate will be reduced.)
+
+    In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation.
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the 
+            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning 
+            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
+        factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . 
+            It should be less than 1.0. Default: 0.1.
+        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. 
+            Default: 10.
+        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . 
+            This make tiny changes of ``loss`` will be ignored. Default: 1e-4.
+        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
+            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum 
+            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
+        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
+        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
+        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, 
+            the update is ignored. Default: 1e-8.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.
+
+    
+    Returns:
+        ``ReduceLROnPlateau`` instance to schedule learning rate.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step(loss)
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step(out[0])
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 mode='min',
+                 factor=0.1,
+                 patience=10,
+                 threshold=1e-4,
+                 threshold_mode='rel',
+                 cooldown=0,
+                 min_lr=0,
+                 epsilon=1e-8,
+                 verbose=False):
+        mode = mode.lower()
+        if mode not in ['min', 'max']:
+            raise ValueError('mode: ' + mode + ' is unknown!')
+        self.mode = mode
+
+        if factor >= 1.0:
+            raise ValueError(
+                'new_lr = origin_lr * gamma and gamma should be < 1.0.')
+        self.factor = factor
+
+        threshold_mode = threshold_mode.lower()
+        if threshold_mode not in ['rel', 'abs']:
+            raise ValueError('threshold mode: ' + threshold_mode +
+                             ' is unknown!')
+        self.threshold_mode = threshold_mode
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s."
+                % type(learning_rate))
+
+        self.verbose = verbose
+        self.patience = patience
+        self.threshold = threshold
+        self.threshold_mode = threshold_mode
+        self.cooldown = cooldown
+        self.min_lr = min_lr
+        self.epsilon = epsilon
+
+        self.cooldown_counter = 0
+        self.best = None
+        self.num_bad_epochs = 0
+
+        # Can not call Parent __init__, so implement here.
+        self.base_lr = float(learning_rate)
+        self.last_lr = float(learning_rate)
+        self.last_epoch = 0
+        self.verbose = verbose
+        self._var_name = None
+
+    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
+    def _state_keys(self):
+        self.keys = [
+            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
+            'last_lr'
+        ]
+
+    def step(self, metrics, epoch=None):
+        """
+        step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` .  
+        The new learning rate will take effect on next epoch.
+
+        Args:
+            metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. 
+                If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or
+                'numpy.ndarray', its shape must be [1].
+            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+
+        Returns:
+            None
+        
+        Examples:
+            Please refer to the example of current _LRScheduler.
+        """
+        if epoch is None:
+            self.last_epoch = self.last_epoch + 1
+        else:
+            self.last_epoch = epoch
+
+        # loss must be 1-D Tensor with shape [1]
+        if isinstance(metrics, (Tensor, numpy.ndarray)):
+            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
+                "should be (1L,), but the current metrics.shape is {}. Maybe that "  \
+                "you should call paddle.mean to process it first.".format(loss.shape)
+        elif not isinstance(metrics,
+                            (int, float, numpy.float32, numpy.float64)):
+            raise TypeError(
+                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".
+                format(type(metrics)))
+
+        if self.cooldown_counter > 0:
+            self.cooldown_counter -= 1
+        else:
+            if self.best is None or self._is_better(metrics, self.best):
+                self.best = metrics
+                self.num_bad_epochs = 0
+            else:
+                self.num_bad_epochs += 1
+
+            if self.num_bad_epochs > self.patience:
+                self.cooldown_counter = self.cooldown
+                self.num_bad_epochs = 0
+                new_lr = max(self.last_lr * self.factor, self.min_lr)
+                if self.last_lr - new_lr > self.epsilon:
+                    self.last_lr = new_lr
+                    if self.verbose:
+                        print('Epoch {}: {} set learning rate to {}.'.format(
+                            self.last_epoch, self.__class__.__name__,
+                            self.last_lr))
+
+    def _is_better(self, current, best):
+        print("mode", self.mode, 'threshold_mode', self.threshold_mode)
+        if self.mode == 'min' and self.threshold_mode == 'rel':
+            return current < best - best * self.threshold
+
+        elif self.mode == 'min' and self.threshold_mode == 'abs':
+            return current < best - self.threshold
+
+        elif self.mode == 'max' and self.threshold_mode == 'rel':
+            return current > best + best * self.threshold
+
+        else:
+            return current > best + self.threshold
+
+
+class CosineAnnealingLR(_LRScheduler):
+    """
+
+    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to 
+    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in 
+    SGDR:
+
+        \begin{aligned}
+            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+            & T_{cur} \neq (2k+1)T_{max}; \\
+            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+            & T_{cur} = (2k+1)T_{max}.
+        \end{aligned}
+
+    The algorithm can be described as following.
+
+    .. math::
+        \begin{aligned}
+            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+            & T_{cur} \neq (2k+1)T_{max}; \\
+            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+            & T_{cur} = (2k+1)T_{max}.
+        \end{aligned}
+    
+    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_. 
+    Note that this only implements the cosine annealing part of SGDR, and not the restarts.
+    
+    Args:
+        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number.
+        T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate.
+        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``CosineAnnealingLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 T_max,
+                 eta_min=0,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(T_max, int):
+            raise TypeError(
+                "The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s."
+                % type(T_max))
+        if not isinstance(eta_min, (float, int)):
+            raise TypeError(
+                "The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s."
+                % type(eta_min))
+        self.T_max = T_max
+        self.eta_min = float(eta_min)
+        super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch,
+                                                verbose)
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
+                math.pi / self.T_max)) / 2
+
+        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
+            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
+                self.last_lr - self.eta_min) + self.eta_min
+
+    def _get_closed_form_lr(self):
+        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
+            math.pi * self.last_epoch / self.T_max)) / 2
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f9de0cefc05d1aaee36fa3af5cfa9ae4affcb97
--- /dev/null
+++ b/python/paddle/optimizer/optimizer.py
@@ -0,0 +1,921 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import six
+import logging
+from collections import defaultdict
+
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
+import paddle
+
+from ..fluid import framework
+from ..fluid import layers
+from ..fluid import unique_name
+from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
+from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
+from ..fluid.framework import program_guard
+from ..fluid.initializer import Constant
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.layers import ops
+from ..fluid.regularizer import append_regularization_ops
+from ..fluid.dygraph import base as imperative_base
+from ..fluid.dygraph import no_grad
+from paddle.fluid import core
+from paddle.fluid.layers import tensor
+from functools import reduce
+from ..fluid.wrapped_decorator import signature_safe_contextmanager
+from .. import compat as cpt
+from .lr_scheduler import _LRScheduler
+
+__all__ = ['Optimizer']
+
+
+class Optimizer(object):
+    """Optimizer Base class.
+
+    Define the common interface of an optimizer.
+    User should not use this class directly,
+    but need to use one of it's implementation.
+
+    Args:
+        learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
+            It can be a float value or any subclass of ``_LRScheduler`` .
+        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It canbe a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
+            some derived class of ``GradientClipBase`` . There are three cliping strategies \
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    Returns:
+       Base class for optimizer. 
+    
+    Examples:
+        .. code-block:: python
+
+            #Take the subclass adam as an example
+            #Optimizer 
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters())
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    @imperative_base.no_grad()
+    def __init__(self,
+                 learning_rate,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        self._parameter_list = list(
+            parameters) if parameters is not None else None
+        self._name = name
+        if framework.in_dygraph_mode():
+            if self._parameter_list is None:
+                raise AttributeError(
+                    "parameters argument given to the Optimizer should not be None in dygraph mode."
+                )
+            if weight_decay is not None:
+                for param in self._parameter_list:
+                    if param.regularizer is not None:
+                        logging.info(
+                            "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
+                            "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                            % weight_decay.__str__())
+                        break
+        if not isinstance(learning_rate, (float, _LRScheduler)):
+            raise TypeError(
+                "learning rate should be float or _LRScheduler, got %s here" %
+                type(learning_rate))
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+        if isinstance(weight_decay, float):
+            from ..fluid.regularizer import L2Decay
+            self.regularization = L2Decay(weight_decay)
+        else:
+            self.regularization = weight_decay
+        self._grad_clip = grad_clip
+        self._learning_rate = learning_rate
+        # the learning rate type should be inferenced from loss
+        self._dtype = None
+        # each program should have a independent learning rate
+        # program -> tensor(learning_rate)
+        self._learning_rate_map = dict()
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra tensors associated with the parameters
+        # to train. These tensors are called accumulators.
+        # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
+        self.helper = None
+        self._opti_name_list = []
+        self._accumulators_holder = {}
+        self._param_device_map = dict()
+        self.clear_gradients = self.clear_grad
+
+    @framework.dygraph_only
+    def state_dict(self):
+        '''
+        Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict.
+        If the optimizer never be called(minimize function), the state_dict is empty.
+
+        Args: 
+            None
+
+        Returns:
+            state_dict(dict) : dict contains all the Tensor used by optimizer
+        
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                state_dict = adam.state_dict()
+
+        '''
+        state_dict = {}
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                state_dict[var_tmp.name] = var_tmp
+        # global step if use lr decay
+        if isinstance(self._learning_rate, _LRScheduler):
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+        return state_dict
+
+    @framework.dygraph_only
+    def set_state_dict(self, state_dict):
+        '''
+        Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be changed.
+
+        Args: 
+            state_dict(dict) : Dict contains all the Tensor needed by optimizer
+        Return:
+            None
+        
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                state_dict = emb.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), 
+                                            parameters=emb.parameters())
+                state_dict = adam.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+
+                adam.set_state_dict(opti_state_dict)
+
+        '''
+
+        if isinstance(self._learning_rate, _LRScheduler):
+            self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
+
+        self._accumulators_holder = state_dict
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                assert var_tmp.name in state_dict, \
+                        "optimizer Tensor {} not found".format( var_tmp.name )
+                var = var_tmp.value()
+                tensor = var.get_tensor()
+                model_np = np.array(tensor)
+
+                load_para = state_dict[var_tmp.name]
+
+                if isinstance(load_para, Variable):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, core.VarBase):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, np.ndarray):
+                    load_para_np = load_para
+                else:
+                    raise RuntimeError("State dict type {} not supprt".format(
+                        str(type(load_para))))
+
+                assert model_np.shape == load_para_np.shape,  \
+                                          "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
+                                                 item.name, model_np.shape, load_para_np.shape)
+
+                assert model_np.dtype == load_para_np.dtype, \
+                                          "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {}  but load tensor with dtype {}".format(
+                                                item.name, model_np.dtype, load_para_np.dtype)
+
+                tensor.set(load_para_np, framework._current_expected_place())
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
+
+    def _create_global_learning_rate(self):
+        if isinstance(self._learning_rate, _LRScheduler):
+            lr_var = self._global_learning_rate()
+            # only create global lr_var once
+            if not isinstance(lr_var, framework.Variable):
+                lr_name = unique_name.generate('learning_rate')
+                self._learning_rate._var_name = lr_name
+                lr_var = self.helper.create_global_variable(
+                    name=lr_name,
+                    shape=[1],
+                    persistable=True,
+                    stop_gradient=True,
+                    dtype=paddle.get_default_dtype()
+                    if self._dtype is None else self._dtype)
+                main_prog = framework.default_main_program()
+                main_prog.lr_sheduler = self._learning_rate
+                main_prog.lr_var = lr_var
+                self._learning_rate_map[framework.default_main_program(
+                )] = lr_var
+
+            lr_value = float(self._learning_rate())
+            self.helper.set_variable_initializer(
+                lr_var, initializer=Constant(value=lr_value))
+        elif isinstance(self._learning_rate, float):
+            # only create global lr_var once
+            lr = self._global_learning_rate()
+            if isinstance(lr, framework.Variable):
+                return
+            else:
+                self._learning_rate_map[framework.default_main_program(
+                )] = layers.create_global_var(
+                    name=unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=float(self._learning_rate),
+                    dtype=paddle.get_default_dtype()
+                    if self._dtype is None else self._dtype,
+                    persistable=True)
+
+    @framework.dygraph_only
+    def set_lr(self, value):
+        """
+        :api_attr: imperative
+        
+        Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler,
+        this API cannot be invoked, because it will lead to conflict.
+
+        Args:
+            value (float|Tensor): the value of learning rate
+
+        Returns:
+            None
+          
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                linear = paddle.nn.Linear(10, 10)
+
+                adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+                # set learning rate manually by python float value
+                lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+                for i in range(5):
+                    adam.set_lr(lr_list[i])
+                    lr = adam.get_lr()
+                    print("current lr is {}".format(lr))
+                # Print:
+                #    current lr is 0.2
+                #    current lr is 0.3
+                #    current lr is 0.4
+                #    current lr is 0.5
+                #    current lr is 0.6
+
+        """
+        if not isinstance(value, (int, float)):
+            raise TypeError(
+                "The type of 'value' in optimizer.set_lr must be float, but received %s."
+                % (type(value)))
+        if isinstance(self._learning_rate, _LRScheduler):
+            raise RuntimeError(
+                "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict."
+            )
+        self._learning_rate = float(value)
+        current_lr = self._global_learning_rate()
+        if current_lr is not None:
+            global_block = framework.default_main_program().global_block()
+            global_block.append_op(
+                type='fill_constant',
+                outputs={'Out': [current_lr]},
+                attrs={
+                    'dtype': current_lr.dtype,
+                    'shape': list(current_lr.shape),
+                    'value': float(value)
+                },
+                stop_gradient=True)
+
+    @framework.dygraph_only
+    def get_lr(self):
+        """
+        :api_attr: imperative
+        
+        Get current step learning rate. The return value is all the same When _LRScheduler is not used,
+        otherwise return the current step learning rate.
+
+        Returns:
+            float: The learning rate of the current step.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                # example1: _LRScheduler is not used, return value is all the same
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+                adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
+                lr = adam.get_lr()
+                print(lr) # 0.001
+
+                # example2: PiecewiseLR is used, return the step learning rate
+                paddle.disable_static()
+                inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+                linear = paddle.nn.Linear(10, 10)
+                inp = paddle.to_tensor(inp)
+                out = linear(inp)
+                loss = paddle.reduce_mean(out)
+                
+                bd = [2, 4, 6, 8]
+                value = [0.2, 0.4, 0.6, 0.8, 1.0]
+                scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0)
+                adam = paddle.optimizer.Adam(scheduler,
+                                       parameters=linear.parameters())
+
+                # first step: learning rate is 0.2
+                np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True
+
+                # learning rate for different steps
+                ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
+                for i in range(12):
+                    adam.step()
+                    lr = adam.get_lr()
+                    scheduler.step()
+                    np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True
+
+        """
+        if isinstance(self._learning_rate, float):
+            return self._learning_rate
+        else:
+            return self._learning_rate()
+
+    def _global_learning_rate(self, program=None):
+        """
+        get global decayed learning rate
+        :return:
+        """
+        if program is None:
+            program = framework.default_main_program()
+        return self._learning_rate_map.get(program, None)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """ append optimize operator to block and return all the added optimize_op
+        """
+        raise NotImplementedError(
+            "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
+        )
+
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate tensor for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        if type(param_lr) == Variable:
+            return param_lr
+        else:
+            if param_lr == 1.0:
+                return self._global_learning_rate()
+            else:
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
+                    return self._global_learning_rate() * param_lr
+
+    def _create_accumulators(self, block, parameters):
+        """Create all accumulators needed by the parameters
+
+        Args:
+            block: the block in which the loss tensor is present
+            parameters: list of parameter tensors for the optimizer
+        """
+        pass
+
+    def _finish_update(self, block, parameters_and_grads):
+        """Finish any custom updates needed
+           before completing an optimization step
+
+        Args:
+            block: the block in which the loss tensor is present
+            parameters: list of parameter tensors for the optimizer
+
+        Returns:
+            None
+        """
+        pass
+
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None,
+                         type=None,
+                         device=None):
+        """Utility function to add an accumulator for a parameter
+
+        Args:
+            block: the block in which the loss tensor is present
+            name: name of the accumulator
+            param: parameter tensor for which accumulator is to be added
+            dtype: data type of the accumulator tensor
+            fill_value: value to initialize the accumulator tensor
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name in self._accumulators and
+                param.name in self._accumulators[name]):
+            if framework.in_dygraph_mode():
+                return self._accumulators[name][param.name]
+            raise Exception("Accumulator {} already exists for parameter {}".
+                            format(name, param.name))
+        if shape == None:
+            shape = param.shape
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_" + name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
+        var = self.helper.create_global_variable(
+            name=var_name,
+            persistable=True,
+            dtype=dtype or param.dtype,
+            type=param.type if type is None else type,
+            shape=shape,
+            belong_to_optimizer=True)
+        if device is None:
+            device = self._get_device_for_param(param.name)
+        with device_guard(device):
+            self.helper.set_variable_initializer(
+                var, initializer=Constant(value=float(fill_value)))
+
+        if framework.in_dygraph_mode():
+            if len(self._accumulators_holder) > 0:
+                assert var_name in self._accumulators_holder, \
+                        "Optimizer set error, {} should in state dict".format( var_name )
+                var.set_value(self._accumulators_holder[var_name])
+
+        self._accumulators[name][param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter tensor for which accumulator is to be fetched
+
+        Returns:
+            accumulator tensor for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name not in self._accumulators or
+                param.name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def _update_param_device_map(self, parameters_and_grads, target_block):
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[0].trainable is True:
+                param_name = param_and_grad[0].name
+                ops = target_block.ops
+                device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
+                )
+                for op in ops:
+                    input_arg_names = op.input_arg_names
+                    if param_name in input_arg_names:
+                        self._param_device_map[param_name] = op.attr(
+                            device_attr_name)
+                        break
+
+    def _get_device_for_param(self, param_name):
+        device = None
+        if param_name in self._param_device_map:
+            device = self._param_device_map[param_name]
+        return device
+
+    def _create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to tensors.
+
+        Args:
+          parameters_and_grads(list(tuple(Tensor, Tensor))):
+            a list of (tensor, gradient) pair to update.
+
+        Returns:
+          return_op_list: a list of operators that will complete one step of
+            optimization. This will include parameter update ops, global step
+            update ops and any other custom ops required by subclasses to manage
+            their internal state.
+        """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        #  _initialize_tensors method. The subclass can extend the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters and extend _finish_update method to add custom ops.
+
+        # Allways called under program_guard use global block as loss block
+        # But if current block is in control flow, append optimize op in the
+        # grad block of current block
+
+        global_block = framework.default_main_program().global_block()
+        target_block = global_block
+        current_block = framework.default_main_program().current_block()
+        if current_block.idx != global_block.idx:
+            assert current_block.backward_block_idx != -1, \
+                "current block is not global_block, but it doesn't have backward block."
+            target_block = framework.default_main_program().blocks[
+                current_block.backward_block_idx]
+
+        start = len(target_block.ops)
+        self.helper = LayerHelper(self.__class__.__name__)
+        self._update_param_device_map(parameters_and_grads, target_block)
+        self._create_accumulators(
+            target_block,
+            [p[0] for p in parameters_and_grads if p[0].trainable])
+        self._create_global_learning_rate()
+
+        if framework.in_dygraph_mode():
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                if param_and_grad[0].trainable is True:
+                    self._append_optimize_op(target_block, param_and_grad)
+        else:
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                with param_and_grad[0].block.program._optimized_guard(
+                        param_and_grad), name_scope("optimizer"):
+                    if param_and_grad[0].trainable is True:
+                        device = self._get_device_for_param(param_and_grad[0]
+                                                            .name)
+                        with device_guard(device):
+                            optimize_op = self._append_optimize_op(
+                                target_block, param_and_grad)
+
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        self._finish_update(target_block, parameters_and_grads)
+
+        end = len(target_block.ops)
+        return target_block._slice_ops(start, end)
+
+    def _append_dgc_ops(self, param_and_grad):
+        pass
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        """
+        The first part of ``minimize``, do auto-diff to append backward operations for
+        the current program.
+
+        Args:
+            loss (Tensor): ``loss`` tensor to run optimizations.
+            startup_program (Program, optional): :ref:`api_fluid_Program` for
+                initializing parameters in ``parameters``. The default value
+                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
+            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
+                to minimize ``loss``. The default value is None, at this time all parameters
+                will be updated.
+            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
+                to be updated. The default value is None.
+            callbacks (list, optional): list of callable objects to run when appending backward
+                operator for one parameter. The default value is None.
+
+        Return:
+            list: list of (param, grad) tensor pairs, param is ``Parameter``,
+                grad is the gradient value corresponding to the parameter.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        act_no_grad_set = None
+        if framework.in_dygraph_mode():
+            pass
+        else:
+            act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
+
+        self._dtype = loss.dtype
+        if framework.in_dygraph_mode():
+            params_grads = []
+            for param in self._parameter_list:
+                if not param.trainable:
+                    continue
+                if param._grad_ivar() is not None:
+                    # create gradient tensor
+                    grad_var = param._grad_ivar()
+                    params_grads.append((param, grad_var))
+        else:
+            if callbacks is None:
+                callbacks = [error_clip_callback]
+            else:
+                assert (isinstance(callbacks, list))
+            program = loss.block.program
+            assert len(loss.shape) == 1 and loss.shape[0] == 1, \
+                "The loss.shape should be (1L,), but the current loss.shape is {}. " \
+                "Maybe that you should call paddle.mean to process the current loss.".format(
+                    loss.shape)
+            parameter_list = parameters if parameters \
+                else self._parameter_list
+            with program_guard(program, startup_program):
+                params_grads = append_backward(loss, parameter_list,
+                                               act_no_grad_set, callbacks)
+                # Note: since we can't use all_reduce_op now,
+                #  dgc_op should be the last op of one grad.
+                self._append_dgc_ops(params_grads)
+        return params_grads
+
+    def apply_gradients(self, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+
+        Args:
+            params_grads (list): list of (param, grad) pair to do optimization.
+
+        Returns:
+            list: A list of operators appended to the current program.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+
+                paddle.disable_static()
+                inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+                linear = paddle.nn.Linear(10, 10)
+                inp = paddle.to_tensor(inp)
+                out = linear(inp)
+                loss = paddle.mean(out)
+                optimizer = paddle.optimizer.Adam(learning_rate=0.1,
+                        parameters=linear.parameters())
+                params_grads = optimizer.backward(loss)
+                optimizer.apply_gradients(params_grads)
+
+        """
+
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+
+        # 'optimizer(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            params_grads = self._grad_clip(params_grads)
+        else:
+
+            params_grads = append_gradient_clip_ops(params_grads)
+
+        # Add regularization if any
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
+
+        optimize_ops = self._create_optimization_pass(params_grads)
+        return optimize_ops
+
+    def _apply_optimize(self, loss, startup_program, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+        Args:
+            loss (Tensor): loss tensor to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameters`.
+            params_grads (list): list of (param, grad) pair to do optimization.
+        Returns:
+            list: A list of operators appended to the current program.
+        """
+        if framework.in_dygraph_mode():
+            with program_guard(framework.default_main_program(),
+                               framework.default_startup_program()):
+                if self._grad_clip is not None:
+                    params_grads = self._grad_clip(params_grads)
+                params_grads = append_regularization_ops(params_grads,
+                                                         self.regularization)
+                optimize_ops = self._create_optimization_pass(params_grads)
+        else:
+            program = loss.block.program
+            with program_guard(program, startup_program):
+                optimize_ops = self.apply_gradients(params_grads)
+        return optimize_ops
+
+    def _get_no_grad_set(self, loss, no_grad_set=None):
+        no_grad_set = _get_no_grad_set_name(no_grad_set)
+        parameters = loss.block.program.global_block().all_parameters()
+        param_no_trainable = set(
+            [param.name for param in parameters if param.trainable is False])
+        # If the parameter is no trainable, it should not have a gradient.
+        no_grad_set.update(param_no_trainable)
+
+        return no_grad_set
+
+    @framework.dygraph_only
+    def clear_grad(self):
+        """
+        Clear the gradients of all optimized parameters for model.
+        
+        Returns:
+            None
+        
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+
+        """
+        for p in self._parameter_list:
+            if p.trainable:
+                p.clear_gradient()
+
+    @imperative_base.no_grad()
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None):
+        """
+        Add operations to minimize ``loss`` by updating ``parameters``.
+
+        Args:
+            loss (Tensor): A ``Tensor`` containing the value to minimize.
+            startup_program (Program, optional): :ref:`api_fluid_Program` for
+                initializing parameters in ``parameters``. The default value
+                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
+            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
+                to minimize ``loss``. The default value is None, at this time all parameters
+                will be updated.
+            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
+                to be updated. The default value is None.
+
+        Returns:
+            tuple: tuple (optimize_ops, params_grads), A list of operators appended
+            by minimize and a list of (param, grad) tensor pairs, param is
+            ``Parameter``, grad is the gradient value corresponding to the parameter.
+            The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
+            indicate program pruning. If so, the program will be pruned by ``feed`` and 
+            ``fetch_list`` before run, see details in ``Executor``.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import paddle.fluid as fluid
+
+                place = fluid.CPUPlace()
+                main = fluid.Program()
+                with fluid.program_guard(main):
+                    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+                    y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+                    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+                    avg_cost = fluid.layers.mean(cost)
+
+                    adam_optimizer = paddle.optimizer.Adam(0.01)
+                    adam_optimizer.minimize(avg_cost)
+
+                    fetch_list = [avg_cost]
+                    train_reader = paddle.batch(
+                        paddle.dataset.uci_housing.train(), batch_size=1)
+                    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+                    exe = fluid.Executor(place)
+                    exe.run(fluid.default_startup_program())
+                    for data in train_reader():
+                        exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+        """
+        assert isinstance(loss, Variable), "The loss should be an Tensor."
+
+        parameter_list = parameters if parameters \
+            else self._parameter_list
+        params_grads = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameters=parameter_list,
+            no_grad_set=no_grad_set)
+
+        optimize_ops = self._apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
+        return optimize_ops, params_grads
+
+    @framework.dygraph_only
+    def step(self):
+        """
+        Execute the optimizer once.
+        
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc4c9bfd53dc15449f03d6de6c8942e977bf562
--- /dev/null
+++ b/python/paddle/optimizer/rmsprop.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["RMSProp"]
+
+
+class RMSProp(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    if centered is True:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
+    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+
+    Parameters:
+        learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay.
+        rho(float): rho is :math: `\\rho` in equation, default is 0.95.
+        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
+            avoid division by zero, default is 1e-6.
+        momentum(float): :math:`\\beta` in equation is the momentum term,
+            default is 0.0.
+        centered(bool): If True, gradients are normalized by the estimated variance of
+            the gradient; if False, by the uncentered second moment. Setting this to
+            True may help with training, but is slightly more expensive in terms of
+            computation and memory. Defaults to False.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): This parameter is used by developers to print debugging information. \
+            For details, please refer to :ref:`api_guide_Name`. Default is None.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+          .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.RMSProp(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+    _mean_grad_acc_str = "mean_grad"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 centered=False,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        super(RMSProp, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+        self._centered = centered
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+            self._add_accumulator(self._mean_grad_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
+                                              param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum,
+                "centered": self._centered
+            },
+            stop_gradient=True)
+
+        return rmsprop_op
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a28a4f04e368cf8a1c1a144639bc743234a540
--- /dev/null
+++ b/python/paddle/static/__init__.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: import framework api under this directory 
+__all__ = [
+    'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
+    'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy',
+    'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
+    'default_main_program', 'default_startup_program', 'Program', 'save',
+    'load', 'data', 'InputSpec'
+]
+
+from . import nn
+from .input import data  #DEFINE_ALIAS
+from .input import InputSpec  #DEFINE_ALIAS
+from ..fluid.executor import Executor  #DEFINE_ALIAS
+from ..fluid.executor import global_scope  #DEFINE_ALIAS
+from ..fluid.executor import scope_guard  #DEFINE_ALIAS
+from ..fluid.backward import append_backward  #DEFINE_ALIAS
+from ..fluid.backward import gradients  #DEFINE_ALIAS
+from ..fluid.compiler import BuildStrategy  #DEFINE_ALIAS
+from ..fluid.compiler import CompiledProgram  #DEFINE_ALIAS
+from ..fluid.compiler import ExecutionStrategy  #DEFINE_ALIAS
+from ..fluid.framework import default_main_program  #DEFINE_ALIAS
+from ..fluid.framework import default_startup_program  #DEFINE_ALIAS
+from ..fluid.framework import Program  #DEFINE_ALIAS
+from ..fluid.framework import name_scope  #DEFINE_ALIAS
+from ..fluid.framework import program_guard  #DEFINE_ALIAS
+from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
+from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
+from ..fluid.parallel_executor import ParallelExecutor  #DEFINE_ALIAS
+from ..fluid.param_attr import WeightNormParamAttr  #DEFINE_ALIAS
+from ..tensor.io import save  #DEFINE_ALIAS
+from ..tensor.io import load  #DEFINE_ALIAS
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb70320ea7551de6e1117900e3769f000fdf23dd
--- /dev/null
+++ b/python/paddle/static/input.py
@@ -0,0 +1,330 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+
+import paddle
+from paddle.fluid import core, Variable
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.data_feeder import check_type
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+
+__all__ = ['data', 'InputSpec']
+
+
+def data(name, shape, dtype=None, lod_level=0):
+    """
+    **Data Layer**
+
+    This function creates a variable on the global block. The global variable
+    can be accessed by all the following operators in the graph. The variable
+    is a placeholder that could be fed with input, such as Executor can feed
+    input into the variable. When `dtype` is None, the dtype
+    will get from the global dtype by `paddle.get_default_dtype()`.
+
+    Args:
+       name (str): The name/alias of the variable, see :ref:`api_guide_Name`
+           for more details.
+       shape (list|tuple): List|Tuple of integers declaring the shape. You can
+           set "None" or -1 at a dimension to indicate the dimension can be of any
+           size. For example, it is useful to set changeable batch size as "None" or -1.
+       dtype (np.dtype|str, optional): The type of the data. Supported
+           dtype: bool, float16, float32, float64, int8, int16, int32, int64,
+           uint8. Default: None. When `dtype` is not set, the dtype will get
+           from the global dtype by `paddle.get_default_dtype()`.
+       lod_level (int, optional): The LoD level of the LoDTensor. Usually users
+           don't have to set this value. For more details about when and how to
+           use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0.
+
+    Returns:
+        Variable: The global variable that gives access to the data.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+
+          # Creates a variable with fixed size [3, 2, 1]
+          # User can only feed data of the same shape to x
+          # the dtype is not set, so it will set "float32" by
+          # paddle.get_default_dtype(). You can use paddle.get_default_dtype() to
+          # change the global dtype
+          x = paddle.static.data(name='x', shape=[3, 2, 1])
+
+          # Creates a variable with changeable batch size -1.
+          # Users can feed data of any batch size into y,
+          # but size of each data sample has to be [2, 1]
+          y = paddle.static.data(name='y', shape=[-1, 2, 1], dtype='float32')
+
+          z = x + y
+
+          # In this example, we will feed x and y with np-ndarray "1"
+          # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
+          feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
+
+          exe = paddle.static.Executor(paddle.framework.CPUPlace())
+          out = exe.run(paddle.static.default_main_program(),
+                        feed={
+                            'x': feed_data,
+                            'y': feed_data
+                        },
+                        fetch_list=[z.name])
+
+          # np-ndarray of shape=[3, 2, 1], dtype=float32, whose elements are 2
+          print(out)
+
+    """
+    helper = LayerHelper('data', **locals())
+    check_type(name, 'name', (six.binary_type, six.text_type), 'data')
+    check_type(shape, 'shape', (list, tuple), 'data')
+
+    shape = list(shape)
+    for i in six.moves.range(len(shape)):
+        if shape[i] is None:
+            shape[i] = -1
+
+    if dtype:
+        return helper.create_global_variable(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            stop_gradient=True,
+            lod_level=lod_level,
+            is_data=True,
+            need_check_feed=True)
+    else:
+        return helper.create_global_variable(
+            name=name,
+            shape=shape,
+            dtype=paddle.get_default_dtype(),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            stop_gradient=True,
+            lod_level=lod_level,
+            is_data=True,
+            need_check_feed=True)
+
+
+class InputSpec(object):
+    """
+    InputSpec describes the signature information of the model input, such as ``shape`` , ``dtype`` , ``name`` .
+
+    This interface is often used to specify input tensor information of models in high-level API.
+    It's also used to specify the tensor information for each input parameter of the forward function
+    decorated by `@paddle.jit.to_static`.
+
+    Args:
+        shape (tuple(integers)|list[integers]): List|Tuple of integers
+            declaring the shape. You can set "None" or -1 at a dimension
+            to indicate the dimension can be of any size. For example,
+            it is useful to set changeable batch size as "None" or -1.
+        dtype (np.dtype|str, optional): The type of the data. Supported
+            dtype: bool, float16, float32, float64, int8, int16, int32, int64,
+            uint8. Default: float32.
+        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
+            for more details.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.static import InputSpec
+
+            input = InputSpec([None, 784], 'float32', 'x')
+            label = InputSpec([None, 1], 'int64', 'label')
+
+            print(input)  # InputSpec(shape=(-1, 784), dtype=VarType.FP32, name=x)
+            print(label)  # InputSpec(shape=(-1, 1), dtype=VarType.INT64, name=label)
+    """
+
+    def __init__(self, shape, dtype='float32', name=None):
+        # replace `None` in shape  with -1
+        self.shape = self._verify(shape)
+        # convert dtype into united represention
+        if dtype is not None:
+            if not isinstance(dtype, core.VarDesc.VarType):
+                dtype = convert_np_dtype_to_dtype_(dtype)
+        self.dtype = dtype
+        self.name = name
+
+    def _create_feed_layer(self):
+        return data(self.name, shape=self.shape, dtype=self.dtype)
+
+    def __repr__(self):
+        return '{}(shape={}, dtype={}, name={})'.format(
+            type(self).__name__, self.shape, self.dtype, self.name)
+
+    @classmethod
+    def from_tensor(cls, tensor, name=None):
+        """
+        Generates a InputSpec based on the description of input tensor.
+
+        Args:
+            tensor(Tensor): the source tensor to generate a InputSpec instance
+
+        Returns:
+            A InputSpec instance generated from Tensor.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                from paddle.static import InputSpec
+
+                paddle.disable_static()
+
+                x = paddle.to_tensor(np.ones([2, 2], np.float32))
+                x_spec = InputSpec.from_tensor(x, name='x')
+                print(x_spec)  # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
+
+        """
+        if isinstance(tensor, (Variable, core.VarBase)):
+            return cls(tensor.shape, tensor.dtype, name or tensor.name)
+        else:
+            raise ValueError(
+                "Input `tensor` should be a Tensor, but received {}.".format(
+                    type(tensor).__name__))
+
+    @classmethod
+    def from_numpy(cls, ndarray, name=None):
+        """
+        Generates a InputSpec based on the description of input np.ndarray.
+
+        Args:
+            tensor(Tensor): the source numpy ndarray to generate a InputSpec instance
+
+        Returns:
+            A InputSpec instance generated from Tensor.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                from paddle.static import InputSpec
+
+                x = np.ones([2, 2], np.float32)
+                x_spec = InputSpec.from_numpy(x, name='x')
+                print(x_spec)  # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
+
+        """
+        return cls(ndarray.shape, ndarray.dtype, name)
+
+    def batch(self, batch_size):
+        """
+        Inserts `batch_size` in front of the `shape`.
+
+        Args:
+            batch_size(int): the inserted integer value of batch size.
+
+        Returns:
+            The original InputSpec instance by inserting `batch_size` in front of `shape`.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.static import InputSpec
+
+                x_spec = InputSpec(shape=[64], dtype='float32', name='x')
+                x_spec.batch(4)
+                print(x_spec) # InputSpec(shape=(4, 64), dtype=VarType.FP32, name=x)
+
+        """
+        if isinstance(batch_size, (list, tuple)):
+            if len(batch_size) != 1:
+                raise ValueError(
+                    "Length of batch_size: {} shall be 1, but received {}.".
+                    format(batch_size, len(batch_size)))
+            batch_size = batch_size[1]
+        elif not isinstance(batch_size, six.integer_types):
+            raise TypeError("type(batch_size) shall be `int`, but received {}.".
+                            format(type(batch_size).__name__))
+
+        new_shape = [batch_size] + list(self.shape)
+        self.shape = tuple(new_shape)
+
+        return self
+
+    def unbatch(self):
+        """
+        Removes the first element of `shape`.
+
+        Returns:
+            The original InputSpec instance by removing the first element of `shape` .
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.static import InputSpec
+
+                x_spec = InputSpec(shape=[4, 64], dtype='float32', name='x')
+                x_spec.unbatch()
+                print(x_spec) # InputSpec(shape=(64,), dtype=VarType.FP32, name=x)
+
+        """
+        if len(self.shape) == 0:
+            raise ValueError(
+                "Not support to unbatch a InputSpec when len(shape) == 0.")
+
+        self.shape = self._verify(self.shape[1:])
+        return self
+
+    def _verify(self, shape):
+        """
+        Verifies the input shape and modifies `None` into `-1`.
+        """
+        if not isinstance(shape, (list, tuple)):
+            raise TypeError(
+                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.".
+                format(type(shape).__name__))
+        if len(shape) == 0:
+            raise ValueError(
+                "`shape` in InputSpec should contain at least 1 element, but received {}.".
+                format(shape))
+
+        for i, ele in enumerate(shape):
+            if ele is not None:
+                if not isinstance(ele, six.integer_types):
+                    raise ValueError(
+                        "shape[{}] should be an `int`, but received `{}`:{}.".
+                        format(i, type(ele).__name__, ele))
+            if ele is None or ele < -1:
+                shape[i] = -1
+
+        return tuple(shape)
+
+    def __hash__(self):
+        # Note(Aurelius84): `name` is not considered as a field to compute hashkey.
+        # Because it's no need to generate a new program in following cases while using
+        # @paddle.jit.to_static.
+        #
+        # Case 1:
+        #      foo(x_var)
+        #      foo(y_var)
+        #  x_var and y_var hold same shape and dtype, they should share a same program.
+        #
+        #
+        # Case 2:
+        #      foo(x_var)
+        #      foo(x_np)  # x_np is a numpy.ndarray.
+        #  x_var and x_np hold same shape and dtype, they should also share a same program.
+        return hash((tuple(self.shape), self.dtype))
+
+    def __eq__(self, other):
+        slots = ['shape', 'dtype', 'name']
+        return (type(self) is type(other) and all(
+            getattr(self, attr) == getattr(other, attr) for attr in slots))
+
+    def __ne__(self, other):
+        return not self == other
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91da0926b1870bb4a7999e62965c135dcf36bf25
--- /dev/null
+++ b/python/paddle/static/nn/__init__.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'fc',
+    'batch_norm',
+    'embedding',
+    'bilinear_tensor_product',
+    'conv2d',
+    'conv2d_transpose',
+    'conv3d',
+    'conv3d_transpose',
+    'create_parameter',
+    'crf_decoding',
+    'data_norm',
+    'deformable_conv',
+    'group_norm',
+    'hsigmoid',
+    'instance_norm',
+    'layer_norm',
+    'multi_box_head',
+    'nce',
+    'prelu',
+    'row_conv',
+    'spectral_norm',
+]
+
+from ...fluid.layers import fc  #DEFINE_ALIAS
+from ...fluid.layers import batch_norm  #DEFINE_ALIAS
+from ...fluid.layers import bilinear_tensor_product  #DEFINE_ALIAS
+from ...fluid.layers import conv2d  #DEFINE_ALIAS
+from ...fluid.layers import conv2d_transpose  #DEFINE_ALIAS
+from ...fluid.layers import conv3d  #DEFINE_ALIAS
+from ...fluid.layers import conv3d_transpose  #DEFINE_ALIAS
+from ...fluid.layers import create_parameter  #DEFINE_ALIAS
+from ...fluid.layers import crf_decoding  #DEFINE_ALIAS
+from ...fluid.layers import data_norm  #DEFINE_ALIAS
+from ...fluid.layers import deformable_conv  #DEFINE_ALIAS
+from ...fluid.layers import group_norm  #DEFINE_ALIAS
+from ...fluid.layers import hsigmoid  #DEFINE_ALIAS
+from ...fluid.layers import instance_norm  #DEFINE_ALIAS
+from ...fluid.layers import layer_norm  #DEFINE_ALIAS
+from ...fluid.layers import multi_box_head  #DEFINE_ALIAS
+from ...fluid.layers import nce  #DEFINE_ALIAS
+from ...fluid.layers import prelu  #DEFINE_ALIAS
+from ...fluid.layers import row_conv  #DEFINE_ALIAS
+from ...fluid.layers import spectral_norm  #DEFINE_ALIAS
+
+from ...fluid.input import embedding  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
old mode 100644
new mode 100755
index 21cae803716a9af8cd040c47f147a02093b21137..0fed32a1676759bd94961af0a8949d035ec48c8f
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -22,9 +22,7 @@ from __future__ import print_function
 from .random import randperm
 from .attribute import rank  #DEFINE_ALIAS
 from .attribute import shape  #DEFINE_ALIAS
-from .creation import create_tensor  #DEFINE_ALIAS
-# from .creation import create_lod_tensor        #DEFINE_ALIAS
-# from .creation import create_random_int_lodtensor        #DEFINE_ALIAS
+from .creation import to_tensor  #DEFINE_ALIAS
 from .creation import crop_tensor  #DEFINE_ALIAS
 from .creation import diag  #DEFINE_ALIAS
 from .creation import eye  #DEFINE_ALIAS
@@ -60,7 +58,7 @@ from .logic import equal  #DEFINE_ALIAS
 from .logic import greater_equal  #DEFINE_ALIAS
 from .logic import greater_than  #DEFINE_ALIAS
 from .logic import is_empty  #DEFINE_ALIAS
-from .logic import isfinite  #DEFINE_ALIAS
+#from .logic import isfinite  #DEFINE_ALIAS
 from .logic import less_equal  #DEFINE_ALIAS
 from .logic import less_than  #DEFINE_ALIAS
 from .logic import logical_and  #DEFINE_ALIAS
@@ -76,7 +74,9 @@ from .logic import equal_all  #DEFINE_ALIAS
 from .manipulation import cast  #DEFINE_ALIAS
 from .manipulation import concat  #DEFINE_ALIAS
 from .manipulation import expand  #DEFINE_ALIAS
+from .manipulation import broadcast_to  #DEFINE_ALIAS
 from .manipulation import expand_as  #DEFINE_ALIAS
+from .manipulation import tile  #DEFINE_ALIAS
 from .manipulation import flatten  #DEFINE_ALIAS
 from .manipulation import gather  #DEFINE_ALIAS
 from .manipulation import gather_nd  #DEFINE_ALIAS
@@ -99,6 +99,7 @@ from .manipulation import unstack  #DEFINE_ALIAS
 from .manipulation import flip  #DEFINE_ALIAS
 from .manipulation import unbind  #DEFINE_ALIAS
 from .manipulation import roll  #DEFINE_ALIAS
+from .manipulation import chunk  #DEFINE_ALIAS
 from .math import abs  #DEFINE_ALIAS
 from .math import acos  #DEFINE_ALIAS
 from .math import asin  #DEFINE_ALIAS
@@ -110,8 +111,7 @@ from .math import cumsum  #DEFINE_ALIAS
 from .math import elementwise_add  #DEFINE_ALIAS
 from .math import elementwise_div  #DEFINE_ALIAS
 from .math import elementwise_floordiv  #DEFINE_ALIAS
-from .math import elementwise_max  #DEFINE_ALIAS
-from .math import elementwise_min  #DEFINE_ALIAS
+from .math import elementwise_mul  #DEFINE_ALIAS
 from .math import elementwise_mod  #DEFINE_ALIAS
 from .math import elementwise_pow  #DEFINE_ALIAS
 from .math import elementwise_sub  #DEFINE_ALIAS
@@ -140,9 +140,15 @@ from .math import sums  #DEFINE_ALIAS
 from .math import tanh  #DEFINE_ALIAS
 from .math import elementwise_sum  #DEFINE_ALIAS
 from .math import max  #DEFINE_ALIAS
+from .math import maximum  #DEFINE_ALIAS
 from .math import min  #DEFINE_ALIAS
+from .math import minimum  #DEFINE_ALIAS
 from .math import mm  #DEFINE_ALIAS
-from .math import div  #DEFINE_ALIAS
+from .math import divide  #DEFINE_ALIAS
+from .math import floor_divide  #DEFINE_ALIAS
+from .math import remainder  #DEFINE_ALIAS
+from .math import mod  #DEFINE_ALIAS
+from .math import floor_mod  #DEFINE_ALIAS
 from .math import multiply  #DEFINE_ALIAS
 from .math import add  #DEFINE_ALIAS
 from .math import atan  #DEFINE_ALIAS
@@ -152,11 +158,16 @@ from .math import log1p  #DEFINE_ALIAS
 from .math import erf  #DEFINE_ALIAS
 from .math import addcmul  #DEFINE_ALIAS
 from .math import addmm  #DEFINE_ALIAS
-from .math import clamp  #DEFINE_ALIAS
+from .math import clip  #DEFINE_ALIAS
 from .math import trace  #DEFINE_ALIAS
 from .math import kron  #DEFINE_ALIAS
-# from .random import gaussin        #DEFINE_ALIAS
-# from .random import uniform        #DEFINE_ALIAS
+from .math import isfinite  #DEFINE_ALIAS
+from .math import isinf  #DEFINE_ALIAS
+from .math import isnan  #DEFINE_ALIAS
+from .math import prod  #DEFINE_ALIAS
+from .random import standard_normal
+from .random import normal
+from .random import uniform  #DEFINE_ALIAS
 from .random import shuffle  #DEFINE_ALIAS
 from .random import randn  #DEFINE_ALIAS
 from .random import rand  #DEFINE_ALIAS
@@ -174,10 +185,12 @@ from .search import index_select  #DEFINE_ALIAS
 from .search import nonzero  #DEFINE_ALIAS
 from .search import sort  #DEFINE_ALIAS
 from .search import index_sample  #DEFINE_ALIAS
+from .search import masked_select  #DEFINE_ALIAS
 from .stat import mean  #DEFINE_ALIAS
 from .stat import reduce_mean  #DEFINE_ALIAS
 from .stat import std  #DEFINE_ALIAS
 from .stat import var  #DEFINE_ALIAS
+from .stat import numel  #DEFINE_ALIAS
 # from .tensor import Tensor        #DEFINE_ALIAS
 # from .tensor import LoDTensor        #DEFINE_ALIAS
 # from .tensor import LoDTensorArray        #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 02c908be347ab00ce29babd01f227e8367e259f2..cb3caf0656e8fd4aba905feed92f10238d1fc9d0 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 
 from __future__ import print_function
+import numpy as np
+
 from ..fluid.framework import Variable
+from ..fluid.framework import unique_name
+from ..fluid.framework import _current_expected_place
+from ..fluid.framework import dygraph_only
 from ..fluid.initializer import Constant
 from ..fluid.layers import core
 from ..fluid.layer_helper import LayerHelper
@@ -21,23 +26,17 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp
 from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder
 from ..fluid.layers import fill_constant
 from paddle.common_ops_import import *
-import paddle
 
 # TODO: define functions to get create a tensor  
 from ..fluid.layers import crop_tensor  #DEFINE_ALIAS
-from ..fluid.layers import diag  #DEFINE_ALIAS
 from ..fluid.layers import fill_constant  #DEFINE_ALIAS
-from ..fluid.layers import create_tensor  #DEFINE_ALIAS
 from ..fluid.layers import linspace  #DEFINE_ALIAS
 import paddle
 
 __all__ = [
-    'create_tensor',
-    #       'create_lod_tensor',
-    #       'create_random_int_lodtensor',
+    'to_tensor',
     'crop_tensor',
     'diag',
-    'eye',
     'fill_constant',
     #       'get_tensor_from_selected_rows',
     'linspace',
@@ -55,10 +54,172 @@ __all__ = [
 ]
 
 
+@dygraph_only
+def to_tensor(data, dtype=None, place=None, stop_gradient=True):
+    """
+    Constructs a ``paddle.Tensor`` or ``paddle.ComplexTensor`` from ``data`` , 
+    which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
+
+    If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy 
+    will be performed and return origin tensor, otherwise a new tensor will be constructed
+    and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype`` 
+    and the current place is cpu, no copy will be performed.
+
+    The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then 
+    ``x.real`` is the real part, and ``x.imag`` is the imaginary part.
+
+    Args:
+        data(scalar|tuple|list|ndarray|Tensor|ComplexTensor): Initial data for the tensor.
+            Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
+        dtype(str, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 
+            'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And
+            'complex64' , 'complex128' only for ComplexTensor.
+            Default: None, infers data type from ``data`` .
+        place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be  
+            CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place.
+        stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True.
+
+    Returns:
+        Tensor: A Tensor or ComplexTensor constructed from ``data``.
+
+    Raises:
+        TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor, paddle.ComplexTensor
+        ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]
+        TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128
+        ValueError: If ``place`` is not paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace
+
+    Examples:
+
+    .. code-block:: python
+
+        import paddle
+        import numpy as np
+        paddle.enable_imperative()
+                
+        type(paddle.to_tensor(1))
+        # <class 'paddle.Tensor'>
+
+        paddle.to_tensor(1)
+        # Tensor: generated_tensor_0
+        # - place: CUDAPlace(0)   # allocate on global default place CPU:0
+        # - shape: [1]
+        # - layout: NCHW
+        # - dtype: int64_t
+        # - data: [1]
+
+        x = paddle.to_tensor(1)
+        paddle.to_tensor(x, dtype='int32', place=paddle.CPUPlace()) # A new tensor will be constructed due to different dtype or place
+        # Tensor: generated_tensor_01
+        # - place: CPUPlace
+        # - shape: [1]
+        # - layout: NCHW
+        # - dtype: int
+        # - data: [1]
+
+        paddle.to_tensor((1.1, 2.2), place=paddle.CUDAPinnedPlace())
+        # Tensor: generated_tensor_1
+        #   - place: CUDAPinnedPlace
+        #   - shape: [2]
+        #   - layout: NCHW
+        #   - dtype: double
+        #   - data: [1.1 2.2]
+
+        paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CUDAPlace(0), stop_gradient=False)
+        # Tensor: generated_tensor_2
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: double
+        #   - data: [0.1 0.2 0.3 0.4]
+
+        type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), , dtype='complex64')
+        # <class 'paddle.ComplexTensor'>
+
+        paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')
+        # ComplexTensor[real]: generated_tensor_0.real
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: float
+        #   - data: [1 2 3 4]
+        # ComplexTensor[imag]: generated_tensor_0.imag
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: float
+        #   - data: [1 0 2 0]
+    """
+
+    if place is None:
+        place = _current_expected_place()
+    elif not isinstance(place,
+                        (core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)):
+        raise ValueError(
+            "'place' must be any of paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
+        )
+
+    #Todo(zhouwei): Support allocate tensor on any other specified card
+    if isinstance(place, core.CUDAPlace) and isinstance(
+            _current_expected_place(), core.CUDAPlace) and place._get_device_id(
+            ) != _current_expected_place()._get_device_id():
+        place = _current_expected_place()
+
+    if not isinstance(data, np.ndarray):
+        if np.isscalar(data) and not isinstance(data, str):
+            data = np.array([data])
+        elif isinstance(data, (list, tuple)):
+            data = np.array(data)
+            if data.dtype == np.object:
+                raise ValueError(
+                    "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
+                    "this means the input data contains nested lists with different lengths. "
+                )
+        elif isinstance(data, paddle.Tensor):
+            data.stop_gradient = stop_gradient
+            if not data.place._equals(place):
+                data = data._copy_to(place, False)
+            if dtype:
+                if convert_dtype(dtype) != convert_dtype(data.dtype):
+                    return data.astype(convert_dtype(dtype))
+            return data
+        elif isinstance(data, paddle.ComplexTensor):
+            return data
+        else:
+            raise TypeError(
+                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor".
+                format(type(data)))
+
+    if dtype:
+        dtype = convert_dtype(dtype)
+        if dtype != data.dtype:
+            data = data.astype(dtype)
+
+    if not np.iscomplexobj(data):
+        return paddle.Tensor(
+            value=data,
+            place=place,
+            persistable=False,
+            zero_copy=True,
+            stop_gradient=stop_gradient)
+    else:
+        name = unique_name.generate('generated_tensor')
+        real_tensor = paddle.Tensor(
+            value=data.real,
+            place=place,
+            zero_copy=True,
+            name=name + ".real",
+            stop_gradient=stop_gradient)
+        imag_tensor = paddle.Tensor(
+            value=data.imag,
+            place=place,
+            zero_copy=True,
+            name=name + ".imag",
+            stop_gradient=stop_gradient)
+        return paddle.ComplexTensor(real_tensor, imag_tensor)
+
+
 def full_like(x, fill_value, dtype=None, name=None):
     """
-	:alias_main: paddle.full_like
-	:alias: paddle.tensor.full_like, paddle.tensor.creation.full_like
 
     This function creates a tensor filled with ``fill_value`` which has identical shape of ``x`` and ``dtype``.
     If the ``dtype`` is None, the data type of Tensor is same with ``x``.
@@ -66,7 +227,7 @@ def full_like(x, fill_value, dtype=None, name=None):
     Args:
         x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
         fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output. The data type can be one
+        dtype(np.dtype|str, optional): The data type of output. The data type can be one
             of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
             data type is the same as input.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
@@ -84,7 +245,7 @@ def full_like(x, fill_value, dtype=None, name=None):
           import paddle
           import numpy as np
           
-          paddle.enable_imperative()  # Now we are in imperative mode 
+          paddle.disable_static()  # Now we are in imperative mode 
           input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input')
           output = paddle.full_like(input, 2.0)
           # [[2. 2. 2.]
@@ -121,14 +282,12 @@ def full_like(x, fill_value, dtype=None, name=None):
 
 def ones(shape, dtype=None, name=None):
     """
-	:alias_main: paddle.ones
-	:alias: paddle.tensor.ones, paddle.tensor.creation.ones
 
     The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1.
 
     Args:
         shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape is int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports
+        dtype(np.dtype|str, optional): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
     
@@ -144,7 +303,7 @@ def ones(shape, dtype=None, name=None):
         .. code-block:: python
 
           import paddle 
-          paddle.enable_imperative()
+          paddle.disable_static()
           
           # default dtype for ones OP
           data1 = paddle.ones(shape=[3, 2]) 
@@ -197,14 +356,14 @@ def ones_like(x, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        x = paddle.imperative.to_variable(np.array([1,2,3], dtype='float32'))
-        out1 = paddle.zeros_like(x) # [1., 1., 1.]
-        out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1]
+            x = paddle.to_tensor(np.array([1,2,3], dtype='float32'))
+            out1 = paddle.zeros_like(x) # [1., 1., 1.]
+            out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1]
 
     """
     return full_like(x=x, fill_value=1, dtype=dtype, name=name)
@@ -212,14 +371,11 @@ def ones_like(x, dtype=None, name=None):
 
 def zeros(shape, dtype=None, name=None):
     """
-	:alias_main: paddle.zeros
-	:alias: paddle.tensor.zeros, paddle.tensor.creation.zeros
-
     The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0.
 
     Args:
         shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of ``shape`` is int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports
+        dtype(np.dtype|str, optional): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64. Default: if None, the date type is float32.
         name(str, optional): The default value is None.  Normally there is no need for user to set this
             property.  For more information, please refer to :ref:`api_guide_Name`.
@@ -237,7 +393,7 @@ def zeros(shape, dtype=None, name=None):
 
           import paddle
           
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data = paddle.zeros(shape=[3, 2], dtype='float32') 
           # [[0. 0.]
           #  [0. 0.]
@@ -287,14 +443,14 @@ def zeros_like(x, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        x = paddle.imperative.to_variable(np.array([1,2,3], dtype='float32'))
-        out1 = paddle.zeros_like(x) # [0., 0., 0.]
-        out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0]
+            x = paddle.to_tensor(np.array([1,2,3], dtype='float32'))
+            out1 = paddle.zeros_like(x) # [0., 0., 0.]
+            out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0]
 
     """
     return full_like(x=x, fill_value=0, dtype=dtype, name=name)
@@ -302,8 +458,6 @@ def zeros_like(x, dtype=None, name=None):
 
 def eye(num_rows, num_columns=None, dtype=None, name=None):
     """
-	:alias_main: paddle.eye
-	:alias: paddle.tensor.eye, paddle.tensor.creation.eye
     
     This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere.
 
@@ -311,7 +465,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
         num_rows(int): the number of rows in each batch Tensor.
         num_columns(int, optional): the number of columns in each batch Tensor.
             If None, default: num_rows.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned Tensor.
+        dtype(np.dtype|str, optional): The data type of the returned Tensor.
             It should be int32, int64, float16, float32, float64. Default: if None, the data type
             is float32.
         name(str, optional): The default value is None.  Normally there is no need for 
@@ -329,7 +483,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
           
           import paddle
 
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data = paddle.eye(3, dtype='int32')
           # [[1 0 0]
           #  [0 1 0]
@@ -352,8 +506,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
 
 def full(shape, fill_value, dtype=None, name=None):
     """
-	:alias_main: paddle.full
-	:alias: paddle.tensor.full, paddle.tensor.creation.full
 
     This Op return a Tensor with the ``fill_value`` which size is same as ``shape``.
     
@@ -364,7 +516,7 @@ def full(shape, fill_value, dtype=None, name=None):
                 If ``shape`` is an Tensor, it should be an 1-D Tensor .
         fill_value(bool|float|int|Tensor): The constant value
             used to initialize the Tensor to be created. If ``fill_value`` is an Tensor, it must be an 1-D Tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output Tensor
+        dtype(np.dtype|str, optional): Data type of the output Tensor
             which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data
             type of created Tensor is `float32`
         name(str, optional): The default value is None.  Normally there is no need for user to set this
@@ -383,7 +535,7 @@ def full(shape, fill_value, dtype=None, name=None):
 
           import paddle
 
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') 
           #[[0]
           # [0]]
@@ -460,7 +612,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         import paddle
         import numpy as np
 
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         out1 = paddle.arange(5)
         # [0, 1, 2, 3, 4]
@@ -472,7 +624,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         out3 = paddle.arange(4.999, dtype='float32')
         # [0., 1., 2., 3., 4.]
 
-        start_var = paddle.imperative.to_variable(np.array([3]))
+        start_var = paddle.to_tensor(np.array([3]))
         out4 = paddle.arange(start_var, 7)
         # [3, 4, 5, 6]
              
@@ -562,9 +714,9 @@ def tril(x, diagonal=0, name=None):
             #        [ 5,  6,  7,  8],
             #        [ 9, 10, 11, 12]])
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
-            x = paddle.imperative.to_variable(data)
+            x = paddle.to_variable(data)
             
             tril1 = paddle.tensor.tril(x)
             # array([[ 1,  0,  0,  0],
@@ -633,10 +785,10 @@ def triu(x, diagonal=0, name=None):
             #        [ 5,  6,  7,  8],
             #        [ 9, 10, 11, 12]])
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             # example 1, default diagonal
-            x = paddle.imperative.to_variable(data)
+            x = paddle.to_variable(data)
             triu1 = paddle.tensor.triu(x)
             # array([[ 1,  2,  3,  4],
             #        [ 0,  6,  7,  8],
@@ -710,12 +862,12 @@ def meshgrid(*args, **kwargs):
           import paddle
           import numpy as np
           
-          paddle.enable_imperative()
+          paddle.disable_static()
 
           input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
           input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
-          tensor_3 = paddle.imperative.to_variable(input_3)
-          tensor_4 = paddle.imperative.to_variable(input_4)
+          tensor_3 = paddle.to_tensor(input_3)
+          tensor_4 = paddle.to_tensor(input_4)
           grid_x, grid_y = paddle.tensor.meshgrid(tensor_3, tensor_4)
 
           #the shape of grid_x is (100, 200)
@@ -750,3 +902,99 @@ def meshgrid(*args, **kwargs):
         type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out})
 
     return out
+
+
+def diag(x, offset=0, padding_value=0, name=None):
+    """
+    If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned.
+
+    If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned.
+
+    The argument ``offset`` controls the diagonal offset:
+
+    If ``offset`` = 0, it is the main diagonal.
+
+    If ``offset`` > 0, it is superdiagonal.
+
+    If ``offset`` < 0, it is subdiagonal.
+
+    Args:
+        x (Tensor): The input tensor. Its shape is either 1-D or 2-D. Its data type should be float32, float64, int32, int64.
+        offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal.
+        padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, a square matrix or a vector. The output data type is the same as input data type.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+
+          paddle.disable_static()
+          x = paddle.to_tensor([1, 2, 3])
+          y = paddle.diag(x)
+          print(y.numpy())
+          # [[1 0 0]
+          #  [0 2 0]
+          #  [0 0 3]]
+
+          y = paddle.diag(x, offset=1)
+          print(y.numpy())
+          # [[0 1 0 0]
+          #  [0 0 2 0]
+          #  [0 0 0 3]
+          #  [0 0 0 0]]
+
+          y = paddle.diag(x, padding_value=6)
+          print(y.numpy())
+          # [[1 6 6]
+          #  [6 2 6]
+          #  [6 6 3]]
+
+        .. code-block:: python
+
+          import paddle
+
+          paddle.disable_static()
+          x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
+          y = paddle.diag(x)
+          print(y.numpy())
+          # [1 5]
+
+          y = paddle.diag(x, offset=1)
+          print(y.numpy())
+          # [2 6]
+
+          y = paddle.diag(x, offset=-1)
+          print(y.numpy())
+          # [4]
+    """
+    if in_dygraph_mode():
+        return core.ops.diag_v2(x, "offset", offset, "padding_value",
+                                padding_value)
+
+    check_type(x, 'x', (Variable), 'diag_v2')
+    check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'],
+                'diag_v2')
+    check_type(offset, 'offset', (int), 'diag_v2')
+    check_type(padding_value, 'padding_value', (int, float), 'diag_v2')
+    if len(x.shape) != 1 and len(x.shape) != 2:
+        raise ValueError(
+            "The dimension of input x must be either 1 or 2, but received {}".
+            format(len(x.shape)))
+
+    helper = LayerHelper("diag_v2", **locals())
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(
+        type='diag_v2',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'offset': offset,
+               'padding_value': padding_value})
+
+    out.stop_gradient = True
+    return out
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index fcff5585bc12a75f274bd29236648d5b201a2f2d..b5b528325cd9f52a8b61ef21df0095c41da5a8ed 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 from paddle.common_ops_import import *
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type
@@ -35,135 +36,134 @@ __all__ = [
 ]
 
 
-def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     """
-	:alias_main: paddle.matmul
-	:alias: paddle.matmul,paddle.tensor.matmul,paddle.tensor.linalg.matmul
+    Applies matrix multiplication to two tensors. `matmul` follows 
+    the complete broadcast rules, 
+    and its behavior is consistent with `np.matmul`.
 
-    Applies matrix multiplication to two tensors.
-
-    Currently, the input tensors' rank can be any, but when the rank of any
-    inputs is bigger than 3, this two inputs' rank should be equal.
+    Currently, the input tensors' number of dimensions can be any, `matmul` can be used to
+    achieve the `dot`, `matmul` and `batchmatmul`.
 
     The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
     flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
 
     - If a transpose flag is specified, the last two dimensions of the tensor
-      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
-      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
-      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
-      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
-      :math:`[1, D]` in transposed form.
-
-    - After transpose, the two tensors are 2-D or n-D and matrix multiplication
-      performs in the following way.
-
-      - If both are 2-D, they are multiplied like conventional matrices.
-      - If either is n-D, it is treated as a stack of matrices residing in the
-        last two dimensions and a batched matrix multiply supporting broadcast
-        applies on the two tensors.
-
-    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
-    nontransposed, the prepended or appended dimension :math:`1` will be
-    removed after matrix multiplication.
+      are transposed. If the tensor is ndim-1 of shape, the transpose is invalid. If the tensor 
+      is ndim-1 of shape :math:`[D]`, then for :math:`x` it is treated as :math:`[1, D]`, whereas 
+      for :math:`y` it is the opposite: It is treated as :math:`[D, 1]`.
+
+    The multiplication behavior depends on the dimensions of `x` and `y`. Specifically:
+
+    - If both tensors are 1-dimensional, the dot product result is obtained.
+
+    - If both tensors are 2-dimensional, the matrix-matrix product is obtained.
+
+    - If the `x` is 1-dimensional and the `y` is 2-dimensional, 
+      a `1` is prepended to its dimension in order to conduct the matrix multiply. 
+      After the matrix multiply, the prepended dimension is removed.
+      
+    - If the `x` is 2-dimensional and `y` is 1-dimensional, 
+      the matrix-vector product is obtained.
+
+    - If both arguments are at least 1-dimensional and at least one argument 
+      is N-dimensional (where N > 2), then a batched matrix multiply is obtained. 
+      If the first argument is 1-dimensional, a 1 is prepended to its dimension 
+      in order to conduct the batched matrix multiply and removed after. 
+      If the second argument is 1-dimensional, a 1 is appended to its 
+      dimension for the purpose of the batched matrix multiple and removed after. 
+      The non-matrix (exclude the last two dimensions) dimensions are 
+      broadcasted according the broadcast rule. 
+      For example, if input is a (j, 1, n, m) tensor and the other is a (k, m, p) tensor, 
+      out will be a (j, k, n, p) tensor.
 
     Args:
-        x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): The input variable which is a Tensor or LoDTensor.
+        x (Tensor): The input tensor which is a Tensor.
+        y (Tensor): The input tensor which is a Tensor.
         transpose_x (bool): Whether to transpose :math:`x` before multiplication.
         transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        alpha (float): The scale of output. Default 1.0.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
 
     Returns:
-        Variable: The product Tensor (or LoDTensor) variable.
+        Tensor: The output Tensor.
 
     Examples:
-        .. code-block:: python
-
-            # Examples to clarify shapes of the inputs and output
-            # x: [B, ..., M, K], y: [B, ..., K, N]
-            # paddle.matmul(x, y)  # out: [B, ..., M, N]
-
-            # x: [B, M, K], y: [B, K, N]
-            # paddle.matmul(x, y)  # out: [B, M, N]
-
-            # x: [B, M, K], y: [K, N]
-            # paddle.matmul(x, y)  # out: [B, M, N]
 
-            # x: [M, K], y: [K, N]
-            # paddle.matmul(x, y)  # out: [M, N]
-
-            # x: [B, M, K], y: [K]
-            # paddle.matmul(x, y)  # out: [B, M]
+    .. code-block:: python
 
-            # x: [K], y: [K]
-            # paddle.matmul(x, y)  # out: [1]
+        import paddle
+        import numpy as np
 
-            # x: [M], y: [N]
-            # paddle.matmul(x, y, True, True)  # out: [M, N]
+        paddle.disable_static()
+        # vector * vector
+        x_data = np.random.random([10]).astype(np.float32)
+        y_data = np.random.random([10]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [1]
+
+        # matrix * vector
+        x_data = np.random.random([10, 5]).astype(np.float32)
+        y_data = np.random.random([5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10]
+
+        # batched matrix * broadcasted vector
+        x_data = np.random.random([10, 5, 2]).astype(np.float32)
+        y_data = np.random.random([2]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 5]
+
+        # batched matrix * batched matrix
+        x_data = np.random.random([10, 5, 2]).astype(np.float32)
+        y_data = np.random.random([10, 2, 5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 5, 5]
+
+        # batched matrix * broadcasted matrix
+        x_data = np.random.random([10, 1, 5, 2]).astype(np.float32)
+        y_data = np.random.random([1, 3, 2, 5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 3, 5, 5]
 
-            import paddle
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3], dtype='float32')
-            y = fluid.data(name='y', shape=[3, 2], dtype='float32')
-            out = paddle.matmul(x, y, True, True)
     """
+    op_type = 'matmul_v2'
+    if in_dygraph_mode():
+        op = getattr(core.ops, op_type)
+        return op(x, y, 'trans_x', transpose_x, 'trans_y', transpose_y)
+
     attrs = {
-        'transpose_X': transpose_x,
-        'transpose_Y': transpose_y,
-        'alpha': float(alpha),
+        'trans_x': transpose_x,
+        'trans_y': transpose_y,
     }
 
-    if in_dygraph_mode():
-        out = _varbase_creator(dtype=x.dtype)
-        core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y',
-                        transpose_y, 'alpha', float(alpha))
-        return out
-
     def __check_input(x, y):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
-            check_variable_and_dtype(
-                val, name, ['float16', 'float32', 'float64'], 'matmul')
-        x_shape = list(x.shape)
-        y_shape = list(y.shape)
-        if len(x_shape) == 1:
-            x_shape = [1] + x_shape
-        if len(y_shape) == 1:
-            y_shape = y_shape + [1]
-
-        # check the inner 2 dimensions
-        if transpose_x:
-            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
-        if transpose_y:
-            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
-        if x_shape[-1] != y_shape[-2]:
-            assert (x_shape[-1] == -1) or (y_shape[-2] == -1),                         \
-                "After performing an optional transpose, Input X's width should be "   \
-                "equal to Y's width for multiplication "                               \
-                "prerequisites. But received X's shape: %s, Y's shape: %s\n" %         \
-                (x_shape, y_shape)
-
-        if len(y_shape) > 2 and len(x_shape) > 2:
-            for i, dim_x in enumerate(x_shape[:-2]):
-                # don't check neg shape
-                if dim_x < 0 or y_shape[i] < 0:
-                    continue
-                if dim_x != y_shape[i]:
-                    raise ValueError(
-                        "When the matrix is larger than 2 dimensions, the higher "
-                        "dimensional values of the two matrices need to be equal. "
-                        "But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
-                        "Y's shape: %s.\n" % (i, i, x_shape, y_shape))
+            check_variable_and_dtype(val, name, ['float32', 'float64'],
+                                     'matmul')
 
     __check_input(x, y)
 
-    helper = LayerHelper('matmul', **locals())
+    helper = LayerHelper('matmul_v2', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
-        type='matmul',
+        type='matmul_v2',
         inputs={'X': x,
                 'Y': y},
         outputs={'Out': out},
@@ -171,7 +171,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     return out
 
 
-def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
+def norm(x, p='fro', axis=None, keepdim=False, name=None):
     """
 	:alias_main: paddle.norm
 	:alias: paddle.norm,paddle.tensor.norm,paddle.tensor.linalg.norm
@@ -180,20 +180,19 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     or 2-norm, and in general the p-norm for p > 0) of a given tensor.
 
     Args:
-        input (Variable): The input tensor could be N-D tensor, and the input data
+        x (Tensor): The input tensor could be N-D tensor, and the input data
             type could be float32 or float64.
-        p (float|string, optional): Order of the norm. Supported values are `fro`, `1`, `2`,
-            and any positive real number yielding the corresponding p-norm.
-        axis (int|list, optional): The axis on which to apply norm operation. If axis is int
-            or list with only one element, the vector norm is computed over the axis.
-            If axis is a list with two elements, the matrix norm is computed over the axis.
+        p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`,
+           `inf`,`-inf` and any positive real number yielding the corresponding p-norm.
+            Not supported: ord < 0, nuclear norm.
+        axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int
+            or list(int)/tuple(int)  with only one element, the vector norm is computed over the axis.
             If `axis < 0`, the dimension to norm operation is rank(input) + axis.
+            If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis.
         keepdim (bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have fewer dimension
             than the :attr:`input` unless :attr:`keepdim` is true, default
             value is False.
-        out (Variable, optional): The output tensor, default value is None. It's data type
-            must be the same as the input Tensor.
         name (str, optional): The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
@@ -209,29 +208,57 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
         .. code-block:: python
             
             import paddle
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float64')
-            
+            import numpy as np
+            paddle.disable_static()
+            shape=[2, 3, 4]
+            np_input = np.arange(24).astype('float32') - 12
+            np_input = np_input.reshape(shape)
+            x = paddle.to_tensor(np_input)
+            #[[[-12. -11. -10.  -9.] [ -8.  -7.  -6.  -5.] [ -4.  -3.  -2.  -1.]]
+            # [[  0.   1.   2.   3.] [  4.   5.   6.   7.] [  8.   9.  10.  11.]]]
+
             # compute frobenius norm along last two dimensions.
-            out_fro = paddle.norm(x, p='fro', axis=[1,2])
-            
+            out_fro = paddle.norm(x, p='fro', axis=[0,1])
+            # out_fro.numpy() [17.435596 16.911535 16.7332   16.911535]
+
             # compute 2-order vector norm along last dimension.
             out_pnorm = paddle.norm(x, p=2, axis=-1)
+            #out_pnorm.numpy(): [[21.118711  13.190906   5.477226]
+            #                    [ 3.7416575 11.224972  19.131126]]
+
+            # compute 2-order  norm along [0,1] dimension.
+            out_pnorm = paddle.norm(x, p=2, axis=[0,1])
+            #out_pnorm.numpy(): [17.435596 16.911535 16.7332   16.911535]
+
+            # compute inf-order  norm
+            out_pnorm = paddle.norm(x, p=np.inf)
+            #out_pnorm.numpy()  = [12.]
+            out_pnorm = paddle.norm(x, p=np.inf, axis=0)
+            #out_pnorm.numpy(): [[12. 11. 10. 9.] [8. 7. 6. 7.] [8. 9. 10. 11.]]
+
+            # compute -inf-order  norm
+            out_pnorm = paddle.norm(x, p=-np.inf)
+            #out_pnorm.numpy(): [0.]
+            out_pnorm = paddle.norm(x, p=-np.inf, axis=0)
+            #out_pnorm.numpy(): [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]]
     """
 
-    def frobenius_norm(input, dim=None, keepdim=False, out=None, name=None):
+    def frobenius_norm(input, dim=None, keepdim=False, name=None):
         """
         The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`.
         Args:
           input (Variable): Tensor, data type float32, float64.
           dim (list, optional): None for last two dimensions.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
-          out (Variable, optional): The tensor variable storing the output.
         """
         if dim is not None and not (isinstance(dim, list) and len(dim) == 2):
             raise ValueError(
                 "The dim of frobenius norm op should be None or two elements list!"
             )
+        if in_dygraph_mode():
+            if dim is None: dim = [-1]
+            return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim',
+                                           keepdim)
         attrs = {
             'dim': dim if dim != None else [-2, -1],
             'keep_dim': keepdim,
@@ -243,16 +270,8 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
                                  'frobenius_norm')
 
         helper = LayerHelper('frobenius_norm', **locals())
-        if out is None:
-            out = helper.create_variable_for_type_inference(
-                dtype=helper.input_dtype())
-        else:
-            check_type(out, 'out', (Variable), 'frobenius_norm')
-            check_dtype(
-                out.dtype, out.name,
-                convert_dtype(input.dtype), 'frobenius_norm',
-                '(The out data type in frobenius_norm must be the same with input data type.)'
-            )
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
 
         helper.append_op(
             type='frobenius_norm',
@@ -265,7 +284,7 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
                     porder=None,
                     axis=None,
                     keepdim=False,
-                    out=None,
+                    asvector=False,
                     name=None):
         """
         Calculate the p-order vector norm for certain  dimension of Tensor `input`.
@@ -274,32 +293,28 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
           porder (float, optional): None for porder=2.0.
           axis (int, optional): None for last dimension.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
-          out (Variable, optional): The tensor variable storing the output.
         """
+        if in_dygraph_mode():
+            if axis is None: axis = -1
+            return core.ops.p_norm(input, 'porder', porder, 'axis', axis,
+                                   'keepdim', keepdim, 'asvector', asvector)
         if porder is not None:
             check_type(porder, 'porder', (float, int), 'p_norm')
         if axis is not None:
             check_type(axis, 'axis', (int), 'p_norm')
+        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
+                                 'p_norm')
+
         attrs = {
             'axis': axis if axis is not None else -1,
             'porder': float(porder) if porder is not None else 2.0,
             'keepdim': keepdim,
+            'asvector': asvector,
             'epsilon': 1e-12,
         }
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'p_norm')
-
         helper = LayerHelper('p_norm', **locals())
-        if out is None:
-            out = helper.create_variable_for_type_inference(
-                dtype=helper.input_dtype())
-        else:
-            check_type(out, 'out', (Variable), 'p_norm')
-            check_dtype(
-                out.dtype, out.name,
-                convert_dtype(input.dtype), 'p_norm',
-                '(The out data type in p_norm must be the same with input data type.)'
-            )
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
 
         helper.append_op(
             type='p_norm',
@@ -308,21 +323,126 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
             attrs=attrs)
         return out
 
+    def inf_norm(input,
+                 porder=None,
+                 axis=axis,
+                 keepdim=False,
+                 asvector=False,
+                 name=None):
+        helper = LayerHelper('frobenius_norm', **locals())
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
+        helper.append_op(type='abs', inputs={'X': input}, outputs={'Out': out})
+        reduce_out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
+
+        reduce_all = True if axis == None or axis == [] or asvector == True else False
+        axis = axis if axis != None and axis != [] else [0]
+
+        reduce_type = 'reduce_max' if porder == np.float(
+            'inf') else 'reduce_min'
+        helper.append_op(
+            type=reduce_type,
+            inputs={'X': out},
+            outputs={'Out': reduce_out},
+            attrs={'dim': axis,
+                   'keep_dim': keepdim,
+                   'reduce_all': reduce_all})
+
+        return reduce_out
+
+    def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None):
+        block = LayerHelper('norm', **locals())
+        out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+
+        cast_out = block.create_variable_for_type_inference(dtype=bool)
+        block.append_op(
+            type='cast',
+            inputs={'X': input},
+            outputs={'Out': cast_out},
+            attrs={
+                'in_dtype': input.dtype,
+                'out_dtype': int(core.VarDesc.VarType.BOOL)
+            })
+        cast_out2 = block.create_variable_for_type_inference(dtype=bool)
+        block.append_op(
+            type='cast',
+            inputs={'X': cast_out},
+            outputs={'Out': cast_out2},
+            attrs={
+                'in_dtype': cast_out.dtype,
+                'out_dtype': int(core.VarDesc.VarType.FP32)
+            })
+        sum_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': cast_out2},
+            outputs={'Out': sum_out},
+            attrs={
+                'dim': axis,
+                'keep_dim': keepdim,
+                'reduce_all': True if axis is None else False
+            })
+        return sum_out
+
+    def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
+        block = LayerHelper('norm', **locals())
+        out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        abs_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='abs', inputs={'X': input}, outputs={'Out': abs_out})
+        pow_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+
+        block.append_op(
+            type='pow',
+            inputs={'X': abs_out},
+            outputs={'Out': pow_out},
+            attrs={'factor': porder})
+        sum_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': pow_out},
+            outputs={'Out': sum_out},
+            attrs={
+                'dim': axis,
+                'keep_dim': keepdim,
+                'reduce_all': True if axis is None else False
+            })
+        porder
+        block.append_op(
+            type='pow',
+            inputs={'X': sum_out},
+            outputs={'Out': out},
+            attrs={'factor': float(1. / porder)})
+        return out
+
     if axis is None and p is not None:
         if isinstance(p, str):
             if p == "fro":
-                return frobenius_norm(
-                    input, dim=axis, keepdim=keepdim, out=out, name=name)
+                return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
             else:
                 raise ValueError(
                     "only valid string values are 'fro', found {}".format(p))
         elif isinstance(p, (int, float)):
             return vector_norm(
-                input, porder=p, axis=axis, keepdim=keepdim, out=out, name=name)
+                x,
+                porder=p,
+                axis=axis,
+                keepdim=keepdim,
+                asvector=True,
+                name=name)
         else:
             raise ValueError("only valid p type is string or float, found {}".
                              format(type(p)))
 
+    if isinstance(axis, tuple):
+        axis = list(axis)
     if isinstance(axis, list) and len(axis) == 1:
         axis = axis[0]
 
@@ -330,7 +450,12 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     if isinstance(axis, int):
         if isinstance(p, (int, float)):
             return vector_norm(
-                input, axis=axis, porder=p, keepdim=keepdim, out=out, name=name)
+                x,
+                axis=axis,
+                porder=p,
+                keepdim=keepdim,
+                asvector=False,
+                name=name)
         else:
             raise ValueError(
                 "unspport p for p-order vector norm. except float, found {}".
@@ -338,11 +463,14 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     #calculate matrix norm, where axis is list with two integers
     elif isinstance(axis, list) and len(axis) == 2:
         if p == "fro":
-            return frobenius_norm(
-                input, dim=axis, keepdim=keepdim, out=out, name=name)
+            return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
+        elif p == 0:
+            return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name)
+        elif p == np.inf or p == -np.inf:
+            return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
         else:
-            raise ValueError(
-                "unspport p for matrix norm, expcept 'fro', found {}".format(p))
+            return p_matrix_norm(
+                x, porder=p, axis=axis, keepdim=keepdim, name=name)
     else:
         raise ValueError(
             "except axis type int or list (length of list <=2), found {}".
@@ -452,35 +580,34 @@ def dist(x, y, p=2):
 
 def dot(x, y, name=None):
     """
-	:alias_main: paddle.dot
-	:alias: paddle.dot,paddle.tensor.dot,paddle.tensor.linalg.dot
-
     This operator calculates inner product for vectors.
    
     .. note::
-       Only support 1-d Tensor(vector).
+       Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix 
+       is the batch dimension, which means that the vectors of multiple batches are dotted. 
 
     Parameters:
-        x(Variable): 1-D ``Tensor`` or ``LoDTensor``. Its datatype should be ``float32``, ``float64``, ``int32``, ``int64``
-        y(Variable): 1-D ``Tensor`` or ``LoDTensor``. Its datatype soulde be ``float32``, ``float64``, ``int32``, ``int64``
+        x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``
+        y(Tensor): 1-D or 2-D ``Tensor``. Its dtype soulde be ``float32``, ``float64``, ``int32``, ``int64``
         name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`
 
     Returns:
-        Variable: the calculated result Tensor/LoDTensor.
+        Variable: the calculated result Tensor.
 
     Examples:
 
     .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
-        
-        with fluid.dygraph.guard():
-          x = fluid.dygraph.to_variable(np.random.uniform(0.1, 1, [10]).astype(np.float32))
-          y = fluid.dygraph.to_variable(np.random.uniform(1, 3, [10]).astype(np.float32))
-          z = paddle.dot(x, y)
-          print(z.numpy())
+
+        paddle.disable_static()
+        x_data = np.random.uniform(0.1, 1, [10]).astype(np.float32)
+        y_data = np.random.uniform(1, 3, [10]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.dot(x, y)
+        print(z.numpy())
 
     """
     op_type = 'dot'
@@ -605,10 +732,10 @@ def cross(x, y, axis=None, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            from paddle.imperative import to_variable
+            from paddle import to_variable
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             data_x = np.array([[1.0, 1.0, 1.0],
                                [2.0, 2.0, 2.0],
@@ -651,11 +778,8 @@ def cross(x, y, axis=None, name=None):
     return out
 
 
-def cholesky(x, upper=False):
+def cholesky(x, upper=False, name=None):
     """
-	:alias_main: paddle.cholesky
-	:alias: paddle.cholesky,paddle.tensor.cholesky,paddle.tensor.linalg.cholesky
-
     Computes the Cholesky decomposition of one symmetric positive-definite
     matrix or batches of symmetric positive-definite matrice. 
     
@@ -680,21 +804,22 @@ def cholesky(x, upper=False):
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
             import numpy as np
 
-            with fluid.dygraph.guard():
-                a = np.random.rand(3, 3)
-                a_t = np.transpose(a, [1, 0])
-                x = np.matmul(a, a_t) + 1e-03
-                x = fluid.dygraph.to_variable(x)
-                out = paddle.cholesky(x, upper=False)
-                print(out.numpy())
-                # [[1.190523   0.         0.        ]
-                #  [0.9906703  0.27676893 0.        ]
-                #  [1.25450498 0.05600871 0.06400121]]
+            paddle.disable_static()
+            a = np.random.rand(3, 3)
+            a_t = np.transpose(a, [1, 0])
+            x_data = np.matmul(a, a_t) + 1e-03
+            x = paddle.to_variable(x_data)
+            out = paddle.cholesky(x, upper=False)
+            print(out.numpy())
+            # [[1.190523   0.         0.        ]
+            #  [0.9906703  0.27676893 0.        ]
+            #  [1.25450498 0.05600871 0.06400121]]
 
     """
+    if in_dygraph_mode():
+        return core.ops.cholesky(x, "upper", upper)
     check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cholesky')
     check_type(upper, 'upper', bool, 'cholesky')
     helper = LayerHelper('cholesky', **locals())
@@ -735,10 +860,10 @@ def bmm(x, y, name=None):
         input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
         input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
 
-        paddle.enable_imperative()
+        paddle.disable_static()
         
-        x = paddle.imperative.to_variable(input1)
-        y = paddle.imperative.to_variable(input2)
+        x = paddle.to_variable(input1)
+        y = paddle.to_variable(input2)
         out = paddle.bmm(x, y)
         #output size: (2, 2, 2)
         #output value:
@@ -782,13 +907,13 @@ def histogram(input, bins=100, min=0, max=0):
         .. code-block:: python
             import paddle
             import numpy as np
-            startup_program = paddle.Program()
-            train_program = paddle.Program()
-            with paddle.program_guard(train_program, startup_program):
+            startup_program = paddle.static.Program()
+            train_program = paddle.static.Program()
+            with paddle.static.program_guard(train_program, startup_program):
                 inputs = paddle.data(name='input', dtype='int32', shape=[2,3])
                 output = paddle.histogram(inputs, bins=5, min=1, max=5)
                 place = paddle.CPUPlace()
-                exe = paddle.Executor(place)
+                exe = paddle.static.Executor(place)
                 exe.run(startup_program)
                 img = np.array([[2, 4, 2], [2, 5, 4]]).astype(np.int32)
                 res = exe.run(train_program,
@@ -800,11 +925,12 @@ def histogram(input, bins=100, min=0, max=0):
         .. code-block:: python
             import paddle
             import numpy as np
-            with paddle.imperative.guard(paddle.CPUPlace()):
-                inputs_np = np.array([1, 2, 1]).astype(np.float)
-                inputs = paddle.imperative.to_variable(inputs_np)
-                result = paddle.histogram(inputs, bins=4, min=0, max=3)
-                print(result) # [0, 2, 1, 0]
+            paddle.disable_static(paddle.CPUPlace())
+            inputs_np = np.array([1, 2, 1]).astype(np.float)
+            inputs = paddle.to_variable(inputs_np)
+            result = paddle.histogram(inputs, bins=4, min=0, max=3)
+            print(result) # [0, 2, 1, 0]
+            paddle.enable_static()
     """
     if in_dygraph_mode():
         return core.ops.histogram(input, "bins", bins, "min", min, "max", max)
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 936022dd73b31f2d5839cc7e8698c6757378d874..36b558d597c1ce1333a8f1eec54e2fd2813625e3 100644
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 from ..fluid.layer_helper import LayerHelper
-from ..fluid.data_feeder import check_type
+from ..fluid.data_feeder import check_type, check_variable_and_dtype
 from ..fluid.layers.layer_function_generator import templatedoc
 from .. import fluid
+from ..fluid.framework import in_dygraph_mode
+from paddle.common_ops_import import *
 
 # TODO: define logic functions of a tensor  
 from ..fluid.layers import is_empty  #DEFINE_ALIAS
@@ -71,12 +73,11 @@ def equal_all(x, y, name=None):
 
           import numpy as np
           import paddle
-          import paddle.imperative as imperative
 
-          paddle.enable_imperative()
-          x = imperative.to_variable(np.array([1, 2, 3]))
-          y = imperative.to_variable(np.array([1, 2, 3]))
-          z = imperative.to_variable(np.array([1, 4, 3]))
+          paddle.disable_static()
+          x = paddle.to_variable(np.array([1, 2, 3]))
+          y = paddle.to_variable(np.array([1, 2, 3]))
+          z = paddle.to_variable(np.array([1, 4, 3]))
           result1 = paddle.equal_all(x, y)
           print(result1.numpy()) # result1 = [True ]
           result2 = paddle.equal_all(x, z)
@@ -92,75 +93,70 @@ def equal_all(x, y, name=None):
 
 
 @templatedoc()
-def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
+def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     """
-	:alias_main: paddle.allclose
-	:alias: paddle.allclose,paddle.tensor.allclose,paddle.tensor.logic.allclose
-
     ${comment}
 
     Args:
-        input(inputtype):{input_comment}.
-        other(othertype):{other_comment}.
-        rtol(rtoltype,optional):{rtol_comment}.
-        atol(atoltype,optional):{atol_comment}.
-        equal_nan(equalnantype,optional):{equal_nan_comment}.
-        name(STR, optional): The default value is None.
-                        Normally there is no need for user to set this property.
-                        For more information, please refer to :ref:`api_guide_Name`.
+        x(Tensor): ${input_comment}.
+        y(Tensor): ${other_comment}.
+        rtol(rtoltype, optional): ${rtol_comment}.
+        atol(atoltype, optional): ${atol_comment}.
+        equal_nan(equalnantype, optional): ${equal_nan_comment}.
+        name (str, optional): Name for the operation. For more information, please
+            refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
-        ${out_comment}.
+        Tensor: ${out_comment}.
+
+    Raises:
+        TypeError: The data type of ``x`` must be one of float32, float64.
+        TypeError: The data type of ``y`` must be one of float32, float64.
+        TypeError: The type of ``rtol`` must be float.
+        TypeError: The type of ``atol`` must be float.
+        TypeError: The type of ``equal_nan`` must be bool.
 
-    Return Type:
-        ${out_type}
-        
     Examples:
         .. code-block:: python
 
           import paddle
-          import paddle.fluid as fluid
           import numpy as np
 
-          use_cuda = fluid.core.is_compiled_with_cuda()
+          paddle.disable_static()
 
-          a = fluid.data(name="a", shape=[2], dtype='float32')
-          b = fluid.data(name="b", shape=[2], dtype='float32')
-
-          result = paddle.allclose(a, b, rtol=1e-05, atol=1e-08,
+          np_x = np.array([10000., 1e-07]).astype("float32")
+          np_y = np.array([10000.1, 1e-08]).astype("float32")
+          x = paddle.to_tensor(np_x)
+          y = paddle.to_tensor(np_y)
+          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
                                   equal_nan=False, name="ignore_nan")
-          result_nan = paddle.allclose(a, b, rtol=1e-05, atol=1e-08,
+          np_result1 = result1.numpy()
+          # [False]
+          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
                                       equal_nan=True, name="equal_nan")
-
-          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
-
-          x = np.array([10000., 1e-07]).astype("float32")
-          y = np.array([10000.1, 1e-08]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([False]), array([False]))
-
-          x = np.array([10000., 1e-08]).astype("float32")
-          y = np.array([10000.1, 1e-09]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([ True]), array([ True]))
-
-          x = np.array([1.0, float('nan')]).astype("float32")
-          y = np.array([1.0, float('nan')]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([False]), array([ True]))
+          np_result2 = result2.numpy()
+          # [False]
+
+          np_x = np.array([1.0, float('nan')]).astype("float32")
+          np_y = np.array([1.0, float('nan')]).astype("float32")
+          x = paddle.to_tensor(np_x)
+          y = paddle.to_tensor(np_y)
+          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
+                                  equal_nan=False, name="ignore_nan")
+          np_result1 = result1.numpy()
+          # [False]
+          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
+                                      equal_nan=True, name="equal_nan")
+          np_result2 = result2.numpy()
+          # [True]
     """
 
+    if in_dygraph_mode():
+        return core.ops.allclose(x, y, 'rtol', rtol, 'atol', atol, 'equal_nan',
+                                 equal_nan)
+
+    check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose')
+    check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose')
     check_type(rtol, 'rtol', float, 'allclose')
     check_type(atol, 'atol', float, 'allclose')
     check_type(equal_nan, 'equal_nan', bool, 'allclose')
@@ -168,7 +164,7 @@ def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     helper = LayerHelper("allclose", **locals())
     out = helper.create_variable_for_type_inference(dtype='bool')
 
-    inputs = {'Input': input, 'Other': other}
+    inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': rtol, 'atol': atol, 'equal_nan': equal_nan}
     helper.append_op(
@@ -201,11 +197,10 @@ def equal(x, y, name=None):
 
           import numpy as np
           import paddle
-          import paddle.imperative as imperative
 
-          paddle.enable_imperative()
-          x = imperative.to_variable(np.array([1, 2, 3]))
-          y = imperative.to_variable(np.array([1, 3, 2]))
+          paddle.disable_static()
+          x = paddle.to_variable(np.array([1, 2, 3]))
+          y = paddle.to_variable(np.array([1, 3, 2]))
           result1 = paddle.equal(x, y)
           print(result1.numpy())  # result1 = [True False False]
     """
@@ -234,11 +229,10 @@ def greater_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.greater_equal(x, y)
             print(result1.numpy())  # result1 = [True False True]
     """
@@ -267,11 +261,10 @@ def greater_than(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.greater_than(x, y)
             print(result1.numpy())  # result1 = [False False True]
     """
@@ -301,11 +294,10 @@ def less_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.less_equal(x, y)
             print(result1.numpy())  # result1 = [True True False]
     """
@@ -335,11 +327,10 @@ def less_than(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.less_than(x, y)
             print(result1.numpy())  # result1 = [False True False]
     """
@@ -369,11 +360,10 @@ def not_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.not_equal(x, y)
             print(result1.numpy())  # result1 = [False True True]
     """
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 5b7c8c37b1b0a549f8c15af3e2d6425d5361de03..845d2cf4d199328bbf8d0e03cd3a7a24a61aafd2 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from ..fluid.layers import core, reshape
+from ..fluid.layers import core
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
@@ -23,17 +23,11 @@ from ..fluid.layers import utils
 import numpy as np
 # TODO: define functions to manipulate a tensor  
 from ..fluid.layers import cast  #DEFINE_ALIAS
-from ..fluid.layers import expand  #DEFINE_ALIAS
-from ..fluid.layers import expand_as  #DEFINE_ALIAS
-from ..fluid.layers import reshape  #DEFINE_ALIAS
-from ..fluid.layers import scatter  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
 from ..fluid.layers import strided_slice  #DEFINE_ALIAS
 from ..fluid.layers import transpose  #DEFINE_ALIAS
-from ..fluid.layers import unique  #DEFINE_ALIAS
 from ..fluid.layers import unstack  #DEFINE_ALIAS
 
-from ..fluid.layers import gather_nd  #DEFINE_ALIAS
 from ..fluid.layers import scatter_nd_add  #DEFINE_ALIAS
 from ..fluid.layers import scatter_nd  #DEFINE_ALIAS
 from ..fluid.layers import shard_index  #DEFINE_ALIAS
@@ -45,6 +39,7 @@ __all__ = [
     'cast',
     'concat',
     'expand',
+    'broadcast_to',
     'expand_as',
     'flatten',
     'gather',
@@ -57,6 +52,7 @@ __all__ = [
     'shard_index',
     'slice',
     'split',
+    'chunk'
     'squeeze',
     'stack',
     'strided_slice',
@@ -68,6 +64,7 @@ __all__ = [
     'flip',
     'unbind',
     'roll',
+    'tile',
 ]
 
 
@@ -103,16 +100,16 @@ def concat(x, axis=0, name=None):
             import paddle
             import numpy as np
             
-            paddle.enable_imperative()  # Now we are in imperative mode
+            paddle.disable_static()  # Now we are in imperative mode
             in1 = np.array([[1, 2, 3],
                             [4, 5, 6]])
             in2 = np.array([[11, 12, 13],
                             [14, 15, 16]])
             in3 = np.array([[21, 22],
                             [23, 24]])
-            x1 = paddle.imperative.to_variable(in1)
-            x2 = paddle.imperative.to_variable(in2)
-            x3 = paddle.imperative.to_variable(in3)
+            x1 = paddle.to_tensor(in1)
+            x2 = paddle.to_tensor(in2)
+            x3 = paddle.to_tensor(in3)
             zero = paddle.full(shape=[1], dtype='int32', fill_value=0)
             # When the axis is negative, the real axis is (axis + Rank(x))
             # As follow, axis is -1, Rank(x) is 2, the real axis is 1
@@ -156,12 +153,12 @@ def flip(x, axis, name=None):
           import paddle
           import numpy as np
 
-          paddle.enable_imperative()
+          paddle.disable_static()
 
           image_shape=(3, 2, 2)
           x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape)
           x = x.astype('float32')
-          img = paddle.imperative.to_variable(x)
+          img = paddle.to_variable(x)
           out = paddle.flip(img, [0,1])
 
           print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]]
@@ -247,13 +244,13 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             image_shape=(2, 3, 4, 4)
             x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
             x = x.astype('float32')
             
-            img = paddle.imperative.to_variable(x)
+            img = paddle.to_variable(x)
             out = paddle.flatten(img, start_axis=1, stop_axis=2)
             # out shape is [2, 12, 4]
     """
@@ -325,8 +322,8 @@ def roll(x, shifts, axis=None, name=None):
             data = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0],
                              [7.0, 8.0, 9.0]])
-            paddle.enable_imperative()
-            x = paddle.imperative.to_variable(data)
+            paddle.disable_static()
+            x = paddle.to_variable(data)
             out_z1 = paddle.roll(x, shifts=1)
             print(out_z1.numpy())
             #[[9. 1. 2.]
@@ -376,7 +373,7 @@ def roll(x, shifts, axis=None, name=None):
         outputs={'Out': out},
         attrs={'axis': axis,
                'shifts': shifts})
-    out = reshape(out, shape=origin_shape, inplace=True)
+    out = layers.reshape(out, shape=origin_shape, inplace=True)
     return out
 
 
@@ -457,10 +454,10 @@ def stack(x, axis=0, name=None):
             data2 = np.array([[3.0, 4.0]])
             data3 = np.array([[5.0, 6.0]])
 
-            paddle.enable_imperative()
-            x1 = paddle.imperative.to_variable(data1)
-            x2 = paddle.imperative.to_variable(data2)
-            x3 = paddle.imperative.to_variable(data3)
+            paddle.disable_static()
+            x1 = paddle.to_variable(data1)
+            x2 = paddle.to_variable(data2)
+            x3 = paddle.to_variable(data3)
 
             out = paddle.stack([x1, x2, x3], axis=0)
             print(out.shape)  # [3, 1, 2]
@@ -474,9 +471,6 @@ def stack(x, axis=0, name=None):
 
 def split(x, num_or_sections, axis=0, name=None):
     """
-	:alias_main: paddle.split
-        :alias: paddle.tensor.split, paddle.tensor.manipulation.split
-    
     Split the input tensor into multiple sub-Tensors.
     
     Args:
@@ -503,10 +497,10 @@ def split(x, num_or_sections, axis=0, name=None):
             import numpy as np
             import paddle
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             # x is a Tensor which shape is [3, 9, 5]
             x_np = np.random.random([3, 9, 5]).astype("int32")
-            x = paddle.imperative.to_variable(x_np)
+            x = paddle.to_tensor(x_np)
 
             out0, out1, out22 = paddle.split(x, num_or_sections=3, axis=1)
             # out0.shape [3, 3, 5]
@@ -595,7 +589,7 @@ def squeeze(x, axis=None, name=None):
 
             import paddle
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             
             x = paddle.rand([5, 1, 10])
             output = paddle.squeeze(x, axis=1)
@@ -612,6 +606,130 @@ def squeeze(x, axis=None, name=None):
     return layers.squeeze(x, axis, name)
 
 
+def unique(x,
+           return_index=False,
+           return_inverse=False,
+           return_counts=False,
+           axis=None,
+           dtype="int64",
+           name=None):
+    """
+    Returns the unique elements of `x` in ascending order.
+
+    Args:
+        x(Tensor): The input tensor, it's data type should be float32, float64, int32, int64.
+        return_index(bool, optional): If True, also return the indices of the input tensor that
+            result in the unique Tensor.
+        return_inverse(bool, optional): If True, also return the indices for where elements in
+            the original input ended up in the returned unique tensor.
+        return_counts(bool, optional): If True, also return the counts for each unique element.
+        axis(int, optional): The axis to apply unique. If None, the input will be flattened.
+            Default: None.
+        dtype(np.dtype|str, optional): The date type of `indices` or `inverse` tensor: int32 or int64.
+            Default: int64.
+        name(str, optional): Name for the operation. For more information, please refer to
+            :ref:`api_guide_Name`. Default: None.
+
+    Returns: 
+        tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \
+            provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \
+            is True. `counts` is provided only if `return_counts` is True.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            x_data = np.array([2, 3, 3, 1, 5, 3])
+            x = paddle.to_tensor(x_data)
+            unique = paddle.unique(x)
+            np_unique = unique.numpy() # [1 2 3 5]
+            _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True)
+            np_indices = indices.numpy() # [3 0 1 4]
+            np_inverse = inverse.numpy() # [1 2 2 0 3 2]
+            np_counts = counts.numpy() # [1 1 3 1]
+
+            x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]])
+            x = paddle.to_tensor(x_data)
+            unique = paddle.unique(x)
+            np_unique = unique.numpy() # [0 1 2 3]
+
+            unique = paddle.unique(x, axis=0)
+            np_unique = unique.numpy() 
+            # [[2 1 3]
+            #  [3 0 1]]
+    """
+    if axis is None:
+        axis = []
+    else:
+        axis = [axis]
+    attr_dtype = convert_np_dtype_to_dtype_(dtype)
+    if in_dygraph_mode():
+        out, inverse, indices, counts = core.ops.unique(
+            x, 'dtype', attr_dtype, 'return_index', return_index,
+            'return_inverse', return_inverse, 'return_counts', return_counts,
+            'axis', axis, "is_sorted", True)
+        outs = [out]
+        if return_index:
+            outs.append(indices)
+        if return_inverse:
+            outs.append(inverse)
+        if return_counts:
+            outs.append(counts)
+
+        if len(outs) == 1:
+            return outs[0]
+
+        return tuple(outs)
+
+    check_variable_and_dtype(x, "input",
+                             ['float32', 'float64', 'int32', 'int64'], 'unique')
+    check_type(return_index, 'return_index', bool, 'unique')
+    check_type(return_inverse, 'return_inverse', bool, 'unique')
+    check_type(return_counts, 'return_counts', bool, 'unique')
+    check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique')
+    if len(axis) != 0:
+        check_type(axis[0], 'axis', int, 'unique')
+
+    helper = LayerHelper('unique', **locals())
+    attrs = {
+        'dtype': attr_dtype,
+        "return_index": return_index,
+        "return_inverse": return_inverse,
+        "return_counts": return_counts,
+        "axis": axis,
+        "is_sorted": True
+    }
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    inverse = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True)
+    outputs = {"Out": out, "Index": inverse}
+    outs = [out]
+    if return_index:
+        indices = helper.create_variable_for_type_inference(
+            dtype=attr_dtype, stop_gradient=True)
+        outputs["Indices"] = indices
+        outs.append(indices)
+    if return_inverse:
+        outs.append(inverse)
+    if return_counts:
+        counts = helper.create_variable_for_type_inference(
+            dtype=attr_dtype, stop_gradient=True)
+        outputs["Counts"] = counts
+        outs.append(counts)
+
+    helper.append_op(
+        type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs)
+
+    if len(outs) == 1:
+        return outs[0]
+
+    return tuple(outs)
+
+
 def unsqueeze(x, axis, name=None):
     """
 	:alias_main: paddle.unsqueeze
@@ -637,7 +755,7 @@ def unsqueeze(x, axis, name=None):
 
             import paddle
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x = paddle.rand([5, 10])
             print(x.shape)  # [5, 10]
             
@@ -658,50 +776,46 @@ def unsqueeze(x, axis, name=None):
     return layers.unsqueeze(x, axis, name)
 
 
-def gather(input, index, overwrite=True):
+def gather(x, index, axis=None, name=None):
     """
-	:alias_main: paddle.gather
-	:alias: paddle.gather,paddle.tensor.gather,paddle.tensor.manipulation.gather
 
     **Gather Layer**
 
-    Output is obtained by gathering entries of the outer-most dimension
-    of X indexed by `index` and concatenate them together.
-
-    .. math::
-
-        Out = X[Index]
-
+    Output is obtained by gathering entries of ``axis``
+    of ``x`` indexed by ``index`` and concatenate them together.
 
     .. code-block:: text
 
 
                 Given:
 
-                X = [[1, 2],
+                x = [[1, 2],
                      [3, 4],
                      [5, 6]]
 
-                Index = [1, 2]
+                index = [1, 2]
+                axis=[0]
 
                 Then:
 
-                Out = [[3, 4],
+                out = [[3, 4],
                        [5, 6]]
     Args:
-        input (Variable): The source input tensor with rank>=1. Supported data type is
+        x (Tensor): The source input tensor with rank>=1. Supported data type is
             int32, int64, float32, float64 and uint8 (only for CPU),
             float16 (only for GPU).
-        index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
-        overwrite (bool, optional): The mode that updating the grad when has same index.
-            If True, use the overwrite mode to update the grad of the same index,
-            if False, use the accumulate mode to update the grad of the same index.
-            Default value is True.
-
-
+        index (Tensor): The index input tensor with rank=1. Data type is int32 or int64.
+        axis (Tensor|int, optional): The axis of input to be gathered, it's can be int or a Tensor with data type is int32 or int64. The default value is None, if None, the ``axis`` is 0.
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        output (Variable): The output is a tensor with the same rank as input.
+        output (Tensor): The output is a tensor with the same rank as ``x``.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64.
+        TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor.
 
     Examples:
 
@@ -709,26 +823,41 @@ def gather(input, index, overwrite=True):
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
-
 
-            with fluid.dygraph.guard():
-                input_1 = np.array([[1,2],[3,4],[5,6]])
-                index_1 = np.array([0,1])
-                input = fluid.dygraph.to_variable(input_1)
-                index = fluid.dygraph.to_variable(index_1)
-                output = paddle.gather(input, index)
-                # expected output: [[1,2],[3,4]]
+            paddle.disable_static()
+            input_1 = np.array([[1,2],[3,4],[5,6]])
+            index_1 = np.array([0,1])
+            input = paddle.to_tensor(input_1)
+            index = paddle.to_tensor(index_1)
+            output = paddle.gather(input, index, axis=0)
+            # expected output: [[1,2],[3,4]]
     """
+    if axis is None:
+        axis = 0
+    axis_tensor = axis
+    if not isinstance(axis, Variable):
+        axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis)
+    if in_dygraph_mode():
+        return core.ops.gather(x, index, axis_tensor)
+
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        'gather')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
+    if isinstance(axis, Variable):
+        check_variable_and_dtype(axis, 'axis', ['int32', 'int64'], 'gather')
+    else:
+        check_type(axis, 'axis', (int), 'gather')
+
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="gather",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": out},
-        attrs={'overwrite': overwrite})
+        inputs={"X": x,
+                "Index": index,
+                "Axis": axis_tensor},
+        outputs={"Out": out})
     return out
 
 
@@ -787,3 +916,523 @@ def unbind(input, axis=0):
         outputs={"Out": outs},
         attrs={"axis": axis})
     return outs
+
+
+def scatter(x, index, updates, overwrite=True, name=None):
+    """
+    **Scatter Layer**
+    Output is obtained by updating the input on selected indices based on updates.
+    
+    .. code-block:: python
+        import numpy as np
+        #input:
+        x = np.array([[1, 1], [2, 2], [3, 3]])
+        index = np.array([2, 1, 0, 1])
+        # shape of updates should be the same as x
+        # shape of updates with dim > 1 should be the same as input
+        updates = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
+        overwrite = False
+        # calculation:
+        if not overwrite:
+            for i in range(len(index)):
+                x[index[i]] = np.zeros((2))
+        for i in range(len(index)):
+            if (overwrite):
+                x[index[i]] = updates[i]
+            else:
+                x[index[i]] += updates[i]
+        # output:
+        out = np.array([[3, 3], [6, 6], [1, 1]])
+        out.shape # [3, 2]
+
+    **NOTICE**: The order in which updates are applied is nondeterministic, 
+    so the output will be nondeterministic if index contains duplicates.
+
+    Args:
+        x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64.
+        index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length.
+        updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input.
+        overwrite (bool): The mode that updating the output when there are same indices. 
+          If True, use the overwrite mode to update the output of the same index,
+	      if False, use the accumulate mode to update the output of the same index.Default value is True.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
+ 
+    Returns:
+        Tensor: The output is a Tensor with the same shape as x.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+
+            x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32)
+            index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+            updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32)
+            
+            x = paddle.to_tensor(x_data)
+            index = paddle.to_tensor(index_data)
+            updates = paddle.to_tensor(updates_data)
+  
+            output1 = paddle.scatter(x, index, updates, overwrite=False)
+            # [[3., 3.],
+            #  [6., 6.],
+            #  [1., 1.]]
+
+            output2 = paddle.scatter(x, index, updates, overwrite=True)
+            # CPU device:
+            # [[3., 3.],
+            #  [4., 4.],
+            #  [1., 1.]]
+            # GPU device maybe have two results because of the repeated numbers in index
+            # result 1:
+            # [[3., 3.],
+            #  [4., 4.],
+            #  [1., 1.]]
+            # result 2:
+            # [[3., 3.],
+            #  [2., 2.],
+            #  [1., 1.]]
+    """
+    if in_dygraph_mode():
+        return core.ops.scatter(x, index, updates, 'overwrite', overwrite)
+
+    check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'scatter')
+    check_type(overwrite, 'overwrite', bool, 'scatter')
+    helper = LayerHelper('scatter', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type="scatter",
+        inputs={"X": x,
+                "Ids": index,
+                "Updates": updates},
+        attrs={'overwrite': overwrite},
+        outputs={"Out": out})
+    return out
+
+
+def chunk(x, chunks, axis=0, name=None):
+    """
+    Split the input tensor into multiple sub-Tensors.
+    
+    Args:
+        x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64.
+        chunks(int): The number of tensor to be split along the certain axis.
+        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type 
+            ``int`` or a ``Tensor`` with shape [1] and data type  ``int32`` or ``int64``.
+            If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+    Returns:
+        list(Tensor): The list of segmented Tensors.
+    Raises:
+        TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64.
+        TypeError: ``chunks`` is not int.
+        TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor.
+    Example:
+        .. code-block:: python
+            
+            import numpy as np
+            import paddle
+            
+            paddle.disable_static()
+            # x is a Tensor which shape is [3, 9, 5]
+            x_np = np.random.random([3, 9, 5]).astype("int32")
+            x = paddle.to_tensor(x_np)
+
+            out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1)
+            # out0.shape [3, 3, 5]
+            # out1.shape [3, 3, 5]
+            # out2.shape [3, 3, 5]
+
+            
+            # axis is negative, the real axis is (rank(x) + axis) which real
+            # value is 1.
+            out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2)
+            # out0.shape [3, 3, 5]
+            # out1.shape [3, 3, 5]
+            # out2.shape [3, 3, 5]
+    """
+    check_type(chunks, 'chunks', (int), 'chunk')
+    return paddle.fluid.layers.split(
+        input=x, num_or_sections=chunks, dim=axis, name=name)
+
+
+def tile(x, repeat_times, name=None):
+    """
+
+    Construct a new Tensor by repeating ``x`` the number of times given by ``repeat_times``.
+    After tiling, the value of the i'th dimension of the output is equal to ``x.shape[i]*repeat_times[i]``.
+
+    Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6.
+
+    Args:
+        x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
+        repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
+            should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            np_data = np.array([1, 2, 3]).astype('int32')
+            data = paddle.to_tensor(np_data)
+            out = paddle.tile(data, repeat_times=[2, 1])
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+
+            out = paddle.tile(data, repeat_times=[2, 2])
+            np_out = out.numpy()
+            # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
+
+            np_repeat_times = np.array([2, 1]).astype("int32")
+            repeat_times = paddle.to_tensor(np_repeat_times)
+            out = paddle.tile(data, repeat_times=repeat_times)
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    if in_dygraph_mode():
+        return core.ops.tile(x, 'repeat_times', repeat_times)
+
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile')
+    check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError(
+            "When the date type is bool for the input 'x' of tile op, you "
+            "must set its stop_gradient to be True by "
+            "some_var.stop_gradient == True supporting some_var is the input.")
+
+    helper = LayerHelper('tile', **locals())
+
+    inputs = {"X": [x]}
+    attrs = {}
+
+    def get_attr_repeat_times(list_repeat_times):
+        attrs_repeat_times = []
+        for idx, times in enumerate(list_repeat_times):
+            if isinstance(times, Variable):
+                attrs_repeat_times.append(-1)
+            else:
+                attrs_repeat_times.append(times)
+                assert times > 0, (
+                    "All elements in repeat_times must be positive for tile.")
+        return attrs_repeat_times
+
+    if isinstance(repeat_times, Variable):
+        repeat_times.stop_gradient = True
+        inputs['RepeatTimes'] = repeat_times
+        attrs['repeat_times'] = [-1]
+    elif isinstance(repeat_times, (list, tuple)):
+        attrs['repeat_times'] = get_attr_repeat_times(repeat_times)
+        if utils._contain_var(repeat_times):
+            inputs['repeat_times_tensor'] = utils._convert_to_tensor_list(
+                repeat_times)
+
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='tile', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+def expand_as(x, y, name=None):
+    """
+
+    Expand the input tensor ``x`` to the same shape as the input tensor ``y``.
+
+    Both the number of dimensions of ``x`` and ``y`` must be less than or equal to 6, and the number of dimensions of ``y`` must be greather than or equal to that of ``x``. The dimension to expand must have a value of 1.
+
+    Args:
+        x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64.
+        y (Tensor): The input tensor that gives the shape to expand to.
+        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor: A Tensor with the same shape as ``y``. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            np_data_x = np.array([1, 2, 3]).astype('int32')
+            np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32')
+            data_x = paddle.to_tensor(np_data_x)
+            data_y = paddle.to_tensor(np_data_y)
+            out = paddle.expand_as(data_x, data_y)
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    if in_dygraph_mode():
+        return core.ops.expand_as_v2(x, y)
+
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as')
+    check_type(y, 'y', Variable, 'expand_as')
+
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError(
+            "When the data type of input 'x' for expand_as is bool, "
+            "you must set its stop_gradient to be False by "
+            "some_var.stop_gradient = True, supporting "
+            "some_var as the input 'x'.")
+    inputs = {"X": [x], "target_tensor": [y]}
+
+    helper = LayerHelper('expand_as', **locals())
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(type='expand_as_v2', inputs=inputs, outputs={'Out': out})
+    return out
+
+
+def expand(x, shape, name=None):
+    """
+
+    Expand the input tensor to a given shape.
+
+    Both the number of dimensions of ``x`` and the number of elements in ``shape`` should be less than or equal to 6. The dimension to expand must have a value 1.
+
+
+    Args:
+        x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64.
+        shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
+            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. 
+            The value -1 in shape means keeping the corresponding dimension unchanged.
+        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        N-D Tensor: A Tensor with the given shape. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            np_data = np.array([1, 2, 3]).astype('int32')
+            data = paddle.to_tensor(np_data)
+            out = paddle.expand(data, shape=[2, 3])
+            out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    if in_dygraph_mode():
+        return core.ops.expand_v2(x, 'shape', shape)
+
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand')
+    check_type(shape, 'shape', (list, tuple, Variable), 'expand')
+
+    inputs = {"X": [x]}
+    attrs = {}
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError("When the data type of input 'x' for expand is bool, "
+                         "you must set its stop_gradient to be False by "
+                         "some_var.stop_gradient = True, supporting "
+                         "some_var as the input.")
+
+    helper = LayerHelper('expand', **locals())
+
+    def get_attr_expand_shape(list_expand_shape):
+        attrs_expand_shape = []
+        for idx, shape in enumerate(list_expand_shape):
+            if isinstance(shape, Variable):
+                attrs_expand_shape.append(-1)
+            else:
+                attrs_expand_shape.append(shape)
+                assert shape > 0 or shape == -1, (
+                    "All elements in shape of expand must be positive or -1.")
+        return attrs_expand_shape
+
+    if isinstance(shape, Variable):
+        shape.stop_gradient = True
+        inputs['Shape'] = shape
+    elif isinstance(shape, (list, tuple)):
+        attrs['shape'] = get_attr_expand_shape(shape)
+        if utils._contain_var(shape):
+            inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list(
+                shape)
+
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+broadcast_to = expand
+
+
+def reshape(x, shape, name=None):
+    """
+    :alias_main: paddle.reshape
+	:alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape
+
+    This operator changes the shape of ``x`` without changing its data.
+
+    Some tricks exist when specifying the target shape.
+
+    1. -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions. Thus one and only one dimension can
+    be set -1.
+
+    2. 0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x. The index of 0s in shape can not exceed
+    the dimension of x.
+
+    Here are some examples to explain it.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
+    shape [6, 8] and leaving x's data unchanged.
+
+    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified is [2, 3, -1, 2], the reshape operator will transform x into a
+    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
+    dimensions.
+
+    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+    besides -1, 0 means the actual dimension value is going to be copied from
+    the corresponding dimension of x.
+
+    Args:
+        x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
+                        The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
+                        If ``shape`` is an Tensor, it should be an 1-D Tensor .
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.
+                            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor: A reshaped Tensor with the same data type as ``x``.
+
+    Raises:
+        ValueError: If more than one elements of ``shape`` is -1.
+        ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``.
+        ValueError: If the elements in ``shape`` is negative except -1.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            data = np.random.random([2, 4, 6]).astype("float32")
+            x = paddle.to_tensor(data)
+
+            positive_four = paddle.fill_constant([1], "int32", 4)
+
+            out_1 = paddle.reshape(x, [-1, 0, 3, 2])
+            # the shape of out_1 is [2,4,3,2].
+
+            out_2 = paddle.reshape(x, shape=[positive_four, 12])
+            # the shape of out_2 is [4, 12].
+
+            shape_tensor = paddle.to_tensor(np.array([8, 6]).astype("int32"))
+            out_3 = paddle.reshape(x, shape=shape_tensor)
+            # the shape of out_2 is [8, 6].
+    """
+    return paddle.fluid.layers.reshape(x=x, shape=shape, name=name)
+
+
+def gather_nd(x, index, name=None):
+    """
+
+    This function is actually a high-dimensional extension of :code:`gather`
+    and supports for simultaneous indexing by multiple axes. :attr:`index` is a
+    K-dimensional integer tensor, which is regarded as a (K-1)-dimensional
+    tensor of :attr:`index` into :attr:`input`, where each element defines
+    a slice of params:
+
+    .. math::
+
+        output[(i_0, ..., i_{K-2})] = input[index[(i_0, ..., i_{K-2})]]
+
+    Obviously, :code:`index.shape[-1] <= input.rank` . And, the output tensor has
+    shape :code:`index.shape[:-1] + input.shape[index.shape[-1]:]` .
+
+    .. code-block:: text
+
+            Given:
+                x =  [[[ 0,  1,  2,  3],
+                       [ 4,  5,  6,  7],
+                       [ 8,  9, 10, 11]],
+                      [[12, 13, 14, 15],
+                       [16, 17, 18, 19],
+                       [20, 21, 22, 23]]]
+                x.shape = (2, 3, 4)
+
+            * Case 1:
+                index = [[1]]
+
+                gather_nd(x, index)
+                         = [x[1, :, :]]
+                         = [[12, 13, 14, 15],
+                            [16, 17, 18, 19],
+                            [20, 21, 22, 23]]
+
+            * Case 2:
+                index = [[0,2]]
+
+                gather_nd(x, index)
+                         = [x[0, 2, :]]
+                         = [8, 9, 10, 11]
+
+            * Case 3:
+                index = [[1, 2, 3]]
+
+                gather_nd(x, index)
+                         = [x[1, 2, 3]]
+                         = [23]
+
+    Args:
+        x (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64.
+        index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank.
+                        Its dtype should be int32, int64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+                        For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64.
+
+    Examples:
+
+        .. code-block:: python
+            
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+            np_x = np.array([[[1, 2], [3, 4], [5, 6]],
+                             [[7, 8], [9, 10], [11, 12]]])
+            np_index = [[0, 1]]
+            x = paddle.to_tensor(np_x)
+            index = paddle.to_tensor(np_index)
+            
+            output = paddle.gather_nd(x, index) #[[3, 4]]
+
+    """
+
+    return paddle.fluid.layers.gather_nd(input=x, index=index, name=name)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
old mode 100644
new mode 100755
index f8fa29757d86d42734989147b13f9eb012f3d86f..0d87c1c2cf705372de7b8534cf8faea1bb5320a6
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -15,14 +15,16 @@
 math functions
 """
 from __future__ import print_function
+import numpy as np
 
 from paddle.common_ops_import import *
+from paddle.tensor import cast
+import paddle
 from ..fluid import layers
 from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
-from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn
-import sys
+from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn
 
 # TODO: define math functions
 # yapf: disable
@@ -33,12 +35,9 @@ from ..fluid.layers import ceil    #DEFINE_ALIAS
 from ..fluid.layers import cos    #DEFINE_ALIAS
 from ..fluid.layers import sinh    #DEFINE_ALIAS
 from ..fluid.layers import cosh    #DEFINE_ALIAS
-from ..fluid.layers import cumsum    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_add    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_div    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_floordiv    #DEFINE_ALIAS
-from ..fluid.layers import elementwise_max    #DEFINE_ALIAS
-from ..fluid.layers import elementwise_min    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_mod    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_mul    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_pow    #DEFINE_ALIAS
@@ -54,18 +53,19 @@ from ..fluid.layers import reduce_sum    #DEFINE_ALIAS
 from ..fluid.layers import round    #DEFINE_ALIAS
 from ..fluid.layers import rsqrt    #DEFINE_ALIAS
 from ..fluid.layers import scale    #DEFINE_ALIAS
-from ..fluid.layers import sign    #DEFINE_ALIAS
 from ..fluid.layers import square    #DEFINE_ALIAS
 from ..fluid.layers import stanh    #DEFINE_ALIAS
 from ..fluid.layers import atan    #DEFINE_ALIAS
 from ..fluid.layers import erf    #DEFINE_ALIAS
 from ..fluid.layers import sqrt    #DEFINE_ALIAS
 from ..fluid.layers import sin    #DEFINE_ALIAS
-from ..fluid.layers import tanh    #DEFINE_ALIAS
 
 from ..fluid.layers import increment    #DEFINE_ALIAS
 from ..fluid.layers import multiplex    #DEFINE_ALIAS
 from ..fluid.layers import sums    #DEFINE_ALIAS
+from ..fluid import layers
+import paddle
+
 
 __all__ = [
         'abs',
@@ -79,8 +79,6 @@ __all__ = [
         'elementwise_add',
         'elementwise_div',
         'elementwise_floordiv',
-        'elementwise_max',
-        'elementwise_min',
         'elementwise_mod',
         'elementwise_pow',
         'elementwise_sub',
@@ -88,9 +86,11 @@ __all__ = [
         'floor',
         'increment',
         'log',
+        'logsumexp',
         'mul',
         'multiplex',
         'pow',
+        'prod',
         'reciprocal',
         'reduce_max',
         'reduce_min',
@@ -110,9 +110,15 @@ __all__ = [
         'tanh',
         'elementwise_sum',
         'max',
+        'maximum',
         'min',
+        'minimum',
         'mm',
-        'div',
+        'divide',
+        'floor_divide',
+        'remainder',
+        'mod',
+        'floor_mod',
         'multiply',
         'add',
         'atan',
@@ -122,70 +128,109 @@ __all__ = [
         'erf',
         'addcmul',
         'addmm',
-        'clamp',
+        'clip',
         'trace',
-        'kron'
+        'kron',
+        'isfinite',
+        'isinf',
+        'isnan'
 ]
 # yapf: enable.
 
-@templatedoc()
-def pow(input, exponent, name=None):
+_supported_int_dtype_ = [
+    VarDesc.VarType.UINT8,
+    VarDesc.VarType.INT8,
+    VarDesc.VarType.INT16,
+    VarDesc.VarType.INT32,
+    VarDesc.VarType.INT64,
+]
+
+_supported_float_dtype_ = [
+    VarDesc.VarType.FP32,
+    VarDesc.VarType.FP64,
+]
+
+def pow(x, y, name=None):
     """
-	:alias_main: paddle.pow
-	:alias: paddle.pow,paddle.tensor.pow,paddle.tensor.math.pow
+    Compute the power of tensor elements. The equation is:
 
-    This is Pow Activation Operator.
+    .. math::
+        out = x^{y} 
 
-    :math:`out = input^{exponent}`
+    **Note**:
+    ``paddle.pow`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-    Args:
-        input(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32`` or ``float64``.
-        exponent(float32|Variable): A scalar with type ``float32`` or a ``Tensor`` with shape [1] and type ``float32``.
-        name(str, optional): The default value is None. Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
 
+    Args:
+        x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64.
+        y (Tensor): An N-D Tensor with type float32, float64, int32 or int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+    
     Returns:
-        Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``input``.
+        N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
 
     Examples:
 
-        .. code-block:: python
+        ..  code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-
-            x = fluid.data(name="x", shape=[32,32], dtype="float32")
+            import numpy as np
 
-            # example 1: argument exponent is float
-            y_1 = paddle.pow(x, 2.0)
-            # y_1 is x^{2.0}
+            paddle.disable_static()
+            
+            # example 1: y is a float
+            x_data = np.array([1, 2, 3])
+            y = 2
+            x = paddle.to_tensor(x_data)
+            res = paddle.pow(x, y)
+            print(res.numpy()) # [1 4 9]
+            
+            # example 2: y is a Tensor
+            y = paddle.fill_constant(shape=[1], value=2, dtype='float32')
+            res = paddle.pow(x, y)
+            print(res.numpy()) # [1 4 9]
 
-            # example 2: argument exponent is Variable
-            exponent_tensor = fluid.layers.fill_constant([1], "float32", 3.0)
-            y_2 = paddle.pow(x, exponent_tensor)
-            # y_2 is x^{3.0}
     """
+    # in dynamic graph mode
     if in_dygraph_mode():
-        return core.ops.pow(input, "exponent", exponent)
-
-    helper = LayerHelper('pow', **locals())
-    inputs = {'X': input}
-    attrs = {}
-    if isinstance(exponent, Variable):
-        exponent.stop_gradient = True
-        inputs['FactorTensor'] = exponent
+        if isinstance(y, (int, float)):
+            return core.ops.pow(x, 'factor', y)
+        elif isinstance(y, (paddle.Tensor, Variable)):
+
+            if x.dtype != y.dtype:
+                y = cast(y, dtype='float64')
+                x = cast(x, dtype='float64')
+                out_dygraph = _elementwise_op_in_dygraph(
+                x, y, axis=-1, act=None, op_name='elementwise_pow')
+                return out_dygraph
+
+            return _elementwise_op_in_dygraph(
+                x, y, axis=-1, act=None, op_name='elementwise_pow')
+        else:
+            raise TypeError('y must be scalar or tensor type, but received: %s '% (y.dtype))
+    # in static graph mode
     else:
-        attrs['factor'] = exponent
-
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    check_dtype(
-        out.dtype, out.name,
-        convert_dtype(input.dtype), 'pow',
-        '(The out data type in pow must be the same with input data type.)')
+        if isinstance(y, (int, float)):
+            helper = LayerHelper('pow', **locals())
+            inputs = {'X': x}
+            attrs = {'factor': y}
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+            helper.append_op(
+                type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+            return out
+        elif isinstance(y, (paddle.Tensor, Variable)):
+            # TODO A potential speed improvement is supporting different types in C++ and removing the cast ops here
+            helper = LayerHelper('elementwise_pow', **locals())
+            if x.dtype != y.dtype:
+                y = cast(y, dtype='float64')
+                x = cast(x, dtype='float64')
+                out = helper.create_variable_for_type_inference(dtype=x.dtype)
+            else:
+                out = helper.create_variable_for_type_inference(dtype=x.dtype)
+            return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
+        else:
+            raise TypeError('y must be scalar or tensor type, but received: %s '% (type(y)))
 
-    helper.append_op(
-        type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
 
 
 @dygraph_only
@@ -208,6 +253,8 @@ def _elementwise_op(helper):
     x = helper.kwargs.get('x', None)
     y = helper.kwargs.get('y', None)
 
+    out = helper.kwargs.get('out', None)
+
     assert x is not None, 'x cannot be None in {}'.format(original_op_type)
     assert y is not None, 'y cannot be None in {}'.format(original_op_type)
     check_variable_and_dtype(
@@ -220,11 +267,12 @@ def _elementwise_op(helper):
     axis = helper.kwargs.get('axis', -1)
     use_mkldnn = helper.kwargs.get('use_mkldnn', False)
     name = helper.kwargs.get('name', None)
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
+
+    if out is None:
+        if name is None:
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        else:
+            out = helper.create_variable(name=name, dtype=x.dtype, persistable=False)
 
     helper.append_op(
         type=op_type,
@@ -236,251 +284,367 @@ def _elementwise_op(helper):
     return helper.append_activation(out)
 
 
-def add(x, y, alpha=1, name=None):
+def add(x, y, name=None):
     """
 Examples:
 
-    .. code-block:: python
+    ..  code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
 
-        def gen_data():
-            return {
-                "x": np.array([2, 3, 4]).astype('float32'),
-                "y": np.array([1, 5, 2]).astype('float32')
-            }
+        paddle.disable_static()
+        np_x = np.array([2, 3, 4]).astype('float64')
+        np_y = np.array([1, 5, 2]).astype('float64')
+        x = paddle.to_variable(np_x)
+        y = paddle.to_variable(np_y)
+        z = paddle.add(x, y)
+        np_z = z.numpy()
+        print(np_z)  # [3., 8., 6. ]
 
-        x = fluid.data(name="x", shape=[3], dtype='float32')
-        y = fluid.data(name="y", shape=[3], dtype='float32')
-        z1 = paddle.add(x, y)
-        z2 = paddle.add(x, y, alpha=10)
-        # z = x + y
+    """
+    op_type = 'elementwise_add'
+    axis = -1
+    if in_dygraph_mode():
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, op_name=op_type)
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z1.name, z2.name])
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        print(z_value[0]) # [3., 8., 6.]
-        print(z_value[1]) # [12. 53. 24.]
 
+def divide(x, y, name=None):
+    """
+    Divide two tensors element-wise. The equation is:
 
-    .. code-block:: python
+    .. math::
+        out = x / y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        def gen_data():
-            return {
-                "x": np.ones((2, 3, 4, 5)).astype('float32'),
-                "y": np.zeros((4, 5)).astype('float32')
-            }
+    Args:
+        x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64.
+        y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        x = fluid.data(name="x", shape=[2, 3, 4, 5], dtype='float32')
-        y = fluid.data(name="y", shape=[4, 5], dtype='float32')
-        z = paddle.add(x, y, name='z')
-        # z = x + y
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+    Examples:
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+        ..  code-block:: python
 
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+            import paddle
+            import numpy as np
 
+            paddle.disable_static()
 
-    ..  code-block:: python
+            np_x = np.array([2, 3, 4]).astype('float64')
+            np_y = np.array([1, 5, 2]).astype('float64')
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.divide(x, y)
+            print(z.numpy())  # [2., 0.6, 2.]
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    """
+    op_type = 'elementwise_div'
+    axis = -1
+    act = None
+    if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+        # rule 2: both the inputs are not Tensor
+        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
+            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
+
+        # rule 3: both the inputs are Tensor
+        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
+            elif x.dtype in _supported_int_dtype_:
+                x = x.astype(paddle.get_default_dtype())
+                y = y.astype(paddle.get_default_dtype())
+
+        # rule 4: x is Tensor, y is scalar
+        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            if x.dtype in _supported_int_dtype_:
+                x = x.astype(paddle.get_default_dtype())
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
+
+        # rule 5: x is scalar, y is Tensor
+        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype in _supported_int_dtype_:
+                y = y.astype(paddle.get_default_dtype())
+            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
 
-        def gen_data():
-            return {
-                "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
-                "y": np.random.randint(1, 5, size=[5]).astype('float32')
-            }
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, act=act, op_name=op_type)
 
-        x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
-        y = fluid.data(name="y", shape=[5], dtype='float32')
-        z = paddle.add(x, y)
-        # z = x / y
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+    # rule 2: both the inputs are not Tensor
+    elif not isinstance(x, Variable) and not isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
+        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
+
+    # rule 3: both the inputs are Tensor
+    elif isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
+        elif x.dtype in _supported_int_dtype_:
+            x = paddle.cast(x, paddle.get_default_dtype())
+            y = paddle.cast(y, paddle.get_default_dtype())
+
+    # rule 4: x is Tensor, y is scalar
+    elif isinstance(x, Variable) and not isinstance(y, Variable):
+        if x.dtype in _supported_int_dtype_:
+            x = paddle.cast(x, paddle.get_default_dtype())
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
+
+    # rule 5: x is scalar, y is Tensor
+    elif not isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype in _supported_int_dtype_:
+            y = paddle.cast(y, paddle.get_default_dtype())
+        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
 
+def floor_divide(x, y, name=None):
+    """
+    Floor divide two tensors element-wise. The equation is:
 
-    ..  code-block:: python
+    .. math::
+        out = x // y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.floor_divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        x = fluid.data(name="x", shape=[3], dtype="float32")
-        y = fluid.data(name='y', shape=[3], dtype='float32')
-        z = paddle.add(x, y)
+    Args:
+        x (Tensor): the input tensor, it's data type should be int32, int64.
+        y (Tensor): the input tensor, it's data type should be int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        data1 = np.array([2, 3, 4], dtype='float32')
-        data2 = np.array([1, 5, 2], dtype='float32')
-        z_value = exe.run(feed={'x': data1,
-                                'y': data2},
-                                fetch_list=[z])
-        print(z_value[0]) # [3. 8. 6.]
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
+    Examples:
 
-    ..  code-block:: python
+        ..  code-block:: python
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        with fluid.dygraph.guard():
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.add(x, y, alpha=-0.5)
-            np_z = z.numpy()
-            print(np_z)  # [1.5, 0.5, 3. ]
+            paddle.disable_static()
+
+            np_x = np.array([2, 3, 8, 7])
+            np_y = np.array([1, 5, 3, 3])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.floor_divide(x, y)
+            print(z.numpy())  # [2, 0, 2, 2]
 
     """
-    op_type = 'elementwise_add'
+    op_type = 'elementwise_floordiv'
     axis = -1
-    act = None
-    if alpha != 1:
-        y = scale(y, scale=alpha)
     if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+        # rule 2: both the inputs are not Tensor
+        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
+            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
+
+        # rule 3: both the inputs are Tensor
+        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
+
+        # rule 4: x is Tensor, y is scalar
+        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
+
+        # rule 5: x is scalar, y is Tensor
+        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
+
         return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name=op_type)
+            x, y, axis=axis, op_name=op_type)
+
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+    # rule 2: both the inputs are not Tensor
+    elif not isinstance(x, Variable) and not isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
+        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
+
+    # rule 3: both the inputs are Tensor
+    elif isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
+
+    # rule 4: x is Tensor, y is scalar
+    elif isinstance(x, Variable) and not isinstance(y, Variable):
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
+
+    # rule 5: x is scalar, y is Tensor
+    elif not isinstance(x, Variable) and isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
 
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
-def div(x, y, name=None):
+def remainder(x, y, name=None):
     """
-Examples:
+    Mod two tensors element-wise. The equation is:
 
-    .. code-block:: python
+    .. math::
+        out = x \% y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.remainder`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        def gen_data():
-            return {
-                "x": np.array([2, 3, 4]).astype('float32'),
-                "y": np.array([1, 5, 2]).astype('float32')
-            }
+    Args:
+        x (Tensor): the input tensor, it's data type should be int32, int64.
+        y (Tensor): the input tensor, it's data type should be int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        x = fluid.data(name="x", shape=[3], dtype='float32')
-        y = fluid.data(name="y", shape=[3], dtype='float32')
-        z = paddle.div(x, y)
-        # z = x / y
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+    Examples:
 
-        print(z_value) # [2., 0.6, 2.]
+        ..  code-block:: python
 
+            import paddle
+            import numpy as np
 
-    .. code-block:: python
+            paddle.disable_static()
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+            np_x = np.array([2, 3, 8, 7])
+            np_y = np.array([1, 5, 3, 3])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.remainder(x, y)
+            print(z.numpy())  # [0, 3, 2, 1]
 
-        def gen_data():
-            return {
-                "x": np.ones((2, 3, 4, 5)).astype('float32'),
-                "y": np.zeros((4, 5)).astype('float32')
-            }
+    """
+    op_type = 'elementwise_mod'
+    axis = -1
+    if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
 
-        x = fluid.data(name="x", shape=[2, 3, 4, 5], dtype='float32')
-        y = fluid.data(name="y", shape=[4, 5], dtype='float32')
-        z = paddle.div(x, y, name='z')
-        # z = x / y
+        elif not isinstance(x, paddle.Tensor):
+            raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+        # rule 3: both the inputs are Tensor
+        elif isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+        # rule 4: x is Tensor, y is scalar
+        elif not isinstance(y, paddle.Tensor):
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
 
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, op_name=op_type)
 
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
 
-    ..  code-block:: python
+    elif not isinstance(x, Variable):
+        raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    # rule 3: both the inputs are Tensor
+    elif isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
 
-        def gen_data():
-            return {
-                "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
-                "y": np.random.randint(1, 5, size=[5]).astype('float32')
-            }
+    # rule 4: x is Tensor, y is scalar
+    elif not isinstance(y, paddle.Tensor):
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
 
-        x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
-        y = fluid.data(name="y", shape=[5], dtype='float32')
-        z = paddle.div(x, y)
-        # z = x / y
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+mod = remainder  #DEFINE_ALIAS
+floor_mod = remainder  #DEFINE_ALIAS
 
 
-    ..  code-block:: python
+def multiply(x, y, axis=-1, name=None):
+    """
+    multiply two tensors element-wise. The equation is:
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    .. math::
+        out = x * y
 
-        with fluid.dygraph.guard(fluid.CPUPlace()):
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.div(x, y)
-            np_z = z.numpy()
-            print(np_z)  # [2., 0.6, 2.]
+    **Note**:
+    ``paddle.multiply`` supports broadcasting. If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
+
+    Args:
+        x (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
+        y (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
+
+    Examples:
+
+        ..  code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
+            y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
+            res = paddle.multiply(x, y)
+            print(res.numpy()) # [[5, 12], [21, 32]]
+
+            x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
+            y_data = np.array([1, 2], dtype=np.float32)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
+            res = paddle.multiply(x, y, axis=1)
+            print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
 
     """
-    op_type = 'elementwise_div'
-    axis = -1
+    op_type = 'elementwise_mul'
     act = None
+    if x.dtype != y.dtype:
+        raise TypeError(
+            'Input tensors must be same type, but received type of x: %s, type of y: %s '
+            % (x.dtype, y.dtype))
+
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
-
-def multiply(x, y, axis=-1, name=None):
+def maximum(x, y, axis=-1, name=None):
     """
-	:alias_main: paddle.multiply
-	:alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply
-
 Examples:
 
     .. code-block:: python
@@ -488,131 +652,201 @@ Examples:
         import paddle
         import numpy as np
 
-        paddle.enable_imperative()
+        paddle.disable_static()
+  
         x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
         y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
-        res = paddle.multiply(x, y)
-        print(res.numpy()) # [[5, 12], [21, 32]]
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[[5. 6.]
+        # [7. 8.]]
 
         x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
         y_data = np.array([1, 2], dtype=np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
-        res = paddle.multiply(x, y, axis=1)
-        print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
-
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y, axis=1)
+        print(res.numpy())
+        #[[[1. 2. 3.]
+        #  [2. 2. 3.]]]
+
+        x_data = np.array([2, 3, 5], dtype=np.float32)
+        y_data = np.array([1, 4, np.nan], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[ 2.  4. nan]
+
+        x_data = np.array([5, 3, np.inf], dtype=np.float32)
+        y_data = np.array([1, 4, 5], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[ 5.  4. inf]
     """
-    op_type = 'elementwise_mul'
+    op_type = 'elementwise_max'
     act = None
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
+def minimum(x, y, axis=-1, name=None):
+    """
+Examples:
+
+    .. code-block:: python
+
+        import paddle
+        import numpy as np
+        paddle.disable_static()
+  
+        x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
+        y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[[1. 2.]
+        # [3. 4.]]
+
+        x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
+        y_data = np.array([1, 2], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y, axis=1)
+        print(res.numpy())
+        #[[[1. 1. 1.]
+        #  [2. 2. 2.]]]
+
+        x_data = np.array([2, 3, 5], dtype=np.float32)
+        y_data = np.array([1, 4, np.nan], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[ 1.  3. nan]
+
+        x_data = np.array([5, 3, np.inf], dtype=np.float32)
+        y_data = np.array([1, 4, 5], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[1. 3. 5.]
+    """
+    op_type = 'elementwise_min'
+    act = None
+    if in_dygraph_mode():
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, act=act, op_name=op_type)
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
 for func in [
         add,
-        div,
-        multiply,
+        maximum,
+        minimum,
+        multiply
 ]:
-    proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div', 'multiply': 'elementwise_mul'}
+    proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div', 'maximum': 'elementwise_max', 'minimum': 'elementwise_min', 'multiply': 'elementwise_mul'}
     op_proto = OpProtoHolder.instance().get_op_proto(proto_dict[func.__name__])
-    if func.__name__ in ['add']:
-        alias_main = ':alias_main: paddle.%(func)s' % {'func': func.__name__}
-        alias = ':alias: paddle.%(func)s, paddle.tensor.%(func)s, paddle.tensor.math.%(func)s' % {'func': func.__name__}
-
-        additional_args_lines = [
-            "alpha (int|float, optional): The alpha factor of the input. Default is 1. If alpha is not 1, the equation becomes Out = X + alpha * Y.",
-            "name (string, optional): Name of the output. \
-            Default is None. It's used to print debug info for developers. Details: \
-            :ref:`api_guide_Name` "
-        ]
-    else:
-        additional_args_lines = [
-            "name (string, optional): Name of the output. \
-            Default is None. It's used to print debug info for developers. Details: \
-            :ref:`api_guide_Name` "
-        ]
 
-    func.__doc__ = alias_main + """\n""" + alias + """\n""" + _generate_doc_string_(
+    additional_args_lines = [
+        "name (string, optional): Name of the output. \
+        Default is None. It's used to print debug info for developers. Details: \
+        :ref:`api_guide_Name` "
+    ]
+
+    func.__doc__ = _generate_doc_string_(
         op_proto,
         additional_args_lines=additional_args_lines,
         skip_attrs_set={"x_data_format", "y_data_format", "axis",
             "use_quantizer", "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out"
         }) + """\n""" + str(func.__doc__)
 
-def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
-    """
-	:alias_main: paddle.sum
-	:alias: paddle.sum,paddle.tensor.sum,paddle.tensor.math.sum
 
+def sum(x, axis=None, dtype=None, keepdim=False, name=None):
+    """
     Computes the sum of tensor elements over the given dimension.
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
-            float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the sum is performed. If
-            :attr:`None`, sum all elements of :attr:`input` and return a
+        x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64.
+        axis (int|list|tuple, optional): The dimensions along which the sum is performed. If
+            :attr:`None`, sum all elements of :attr:`x` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-            the dimension to reduce is :math:`rank + dim[i]`.
-        dtype(str, optional): The dtype of output tensor. The default value is None, the dtype
-            of output is the same as input tensor.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`,
+            the dimension to reduce is :math:`rank + axis[i]`.
+        dtype (str, optional): The dtype of output Tensor. The default value is None, the dtype
+            of output is the same as input Tensor `x`.
+        keepdim (bool, optional): Whether to reserve the reduced dimension in the
+            output Tensor. The result Tensor will have one fewer dimension
+            than the :attr:`x` unless :attr:`keepdim` is true, default
             value is False.
-        name(str, optional): The default value is None.  Normally there is no need for
+        name (str, optional): The default value is None. Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, results of summation operation on the specified dim of input tensor,
-        it's data type is the same as input's Tensor.
+        Tensor: Results of summation operation on the specified axis of input Tensor `x`,
+        it's data type is the same as `x`.
 
     Raises:
-        ValueError, the :attr:`dtype` must be float64 or int64.
+        ValueError: The :attr:`dtype` must be float64 or int64.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
 
     Examples:
         .. code-block:: python
 
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
+            paddle.disable_static()
+
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
             # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
+            x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32')
+            x = paddle.to_variable(x_data)
             out1 = paddle.sum(x)  # [3.5]
-            out2 = paddle.sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
-            out3 = paddle.sum(x, dim=-1)  # [1.9, 1.6]
-            out4 = paddle.sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+            out2 = paddle.sum(x, axis=0)  # [0.3, 0.5, 1.1, 1.6]
+            out3 = paddle.sum(x, axis=-1)  # [1.9, 1.6]
+            out4 = paddle.sum(x, axis=1, keepdim=True)  # [[1.9], [1.6]]
 
             # y is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
             # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            out5 = paddle.sum(y, dim=[1, 2]) # [10, 26]
-            out6 = paddle.sum(y, dim=[0, 1]) # [16, 20]
-
+            y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32')
+            y = paddle.to_variable(y_data)
+            out5 = paddle.sum(y, axis=[1, 2]) # [10, 26]
+            out6 = paddle.sum(y, axis=[0, 1]) # [16, 20]
     """
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+    if axis is not None and not isinstance(axis, (list, tuple)):
+        axis = [axis]
+
+    if not axis:
+        reduce_all_flag = True
+    else:
+        if len(axis) == len(x.shape):
+            reduce_all_flag = True
+        else:
+            reduce_all_flag = False
+
     attrs = {
-        'dim': dim if dim != None and dim != [] else [0],
-        'keep_dim': keep_dim,
-        'reduce_all': True if dim == None or dim == [] else False,
+        'dim': axis if axis != None and axis != [] and axis != () else [0],
+        'keep_dim': keepdim,
+        'reduce_all': reduce_all_flag
     }
     dtype_flag = False
     if dtype is not None:
         if dtype in ['float64', 'int64']:
-            if (convert_dtype(input.dtype) == "float32" and dtype == "float64") or \
-               (convert_dtype(input.dtype) == "int32" and dtype == "int64"):
+            if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \
+               (convert_dtype(x.dtype) == "int32" and dtype == "int64"):
                 attrs.update({
-                    'in_dtype': input.dtype,
+                    'in_dtype': x.dtype,
                     'out_dtype': convert_np_dtype_to_dtype_(dtype)
                 })
                 dtype_flag = True
@@ -622,27 +856,28 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
                 format(dtype))
 
     if in_dygraph_mode():
-        reduce_all = True if dim == None or dim == [] else False
-        dim = dim if dim != None and dim != [] else [0]
+        axis = axis if axis != None and axis != [] else [0]
         if dtype_flag:
-            return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim,
-                                       'reduce_all', reduce_all, 'in_dtype',
-                                       input.dtype, 'out_dtype',
+            return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
+                                       'reduce_all', reduce_all_flag, 'in_dtype',
+                                       x.dtype, 'out_dtype',
                                        convert_np_dtype_to_dtype_(dtype))
         else:
-            return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim,
-                                       'reduce_all', reduce_all)
+            return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
+                                       'reduce_all', reduce_all_flag)
     check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_sum')
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum')
+    check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum')
+
     helper = LayerHelper('sum', **locals())
     if dtype_flag:
         out = helper.create_variable_for_type_inference(
             dtype=convert_np_dtype_to_dtype_(dtype))
     else:
-        out = helper.create_variable_for_type_inference(dtype=input.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reduce_sum',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs=attrs)
     return out
@@ -884,11 +1119,11 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
             data_y = np.ones((2, 2)).astype(np.float32)
             data_input = np.ones((2, 2)).astype(np.float32)
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
-            x = paddle.imperative.to_variable(data_x)
-            y = paddle.imperative.to_variable(data_y)
-            input = paddle.imperative.to_variable(data_input)
+            x = paddle.to_variable(data_x)
+            y = paddle.to_variable(data_y)
+            input = paddle.to_variable(data_input)
 
             out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
 
@@ -934,81 +1169,83 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
     return out
 
 
-def logsumexp(x, dim=None, keepdim=False, name=None):
+def logsumexp(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.logsumexp
-	:alias: paddle.logsumexp,paddle.tensor.logsumexp,paddle.tensor.math.logsumexp
-
-    This operator calculates the log of the sum of exponentials of the input Tensor.
+    This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` .
 
     .. math::
        logsumexp(x) = \log\sum exp(x)
 
-
-    Parameters:
-       x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64.
-       dim (list|int, optional): The dimensions along which the sum is performed. If :attr:`None`,
-         sum all elements of :attr:`input` and return a Tensor variable with a single element,
-         otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-         the dimension to reduce is :math:`rank + dim[i]`.
-       keep_dim (bool, optional): Whether to reserve the reduced dimension in the output Tensor.
-         The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim`
-         is true, default value is False.
-       name (str, optional): The default value is None.  Normally there is no need for user to
-         set this property.  For more information, please refer to :ref:`api_guide_Name`
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            logsumexp calculations. ``axis`` should be int, list(int) or
+            tuple(int). If ``axis`` is a list/tuple of dimension(s), logsumexp
+            is calculated along all element(s) of ``axis`` . ``axis`` or
+            element(s) of ``axis`` should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is
+            less than 0, it works the same way as :math:`axis + D` . If
+            ``axis`` is None, logsumexp is calculated along all elements of
+            ``x``. Default is None.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keep_dim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       Variable: The calcuated result Tensor/LoDTensor.
+        Tensor, results of logsumexp along ``axis`` of ``x``, with the same data
+        type as ``x``.
 
     Examples:
 
     .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
 
-        with fluid.dygraph.guard():
-          np_x = np.random.uniform(0.1, 1, [10]).astype(np.float32)
-          x = fluid.dygraph.to_variable(np_x)
-          print(paddle.logsumexp(x).numpy())
-
-    ..  code-block:: python
-
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+        paddle.disable_static()
 
-        with fluid.dygraph.guard():
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            print(paddle.logsumexp(x, dim=1).numpy())
-            print(paddle.logsumexp(x, dim=[0, 2]).numpy())
+        x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]])
+        x = paddle.to_tensor(x)
+        out1 = paddle.logsumexp(x) # [3.4691226]
+        out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
 
     """
-    op_type = 'logsumexp'
-    assert x is not None, 'x cannot be None in {}'.format(op_type)
+    if isinstance(axis, int):
+        axis = [axis]
+    reduce_all = True if axis is None \
+        or len(axis)==0 \
+        or len(axis) == len(x.shape) else False
+    if axis is None or len(axis) == 0:
+        axis = [0]
 
-    # reduce_sum does not support float16
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
+    if in_dygraph_mode():
+        return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim,
+                                    'reduce_all', reduce_all)
 
-    exp_out = layers.exp(x)
-    sum_out = layers.reduce_sum(exp_out, dim, keepdim)
+    check_variable_and_dtype(x, 'x',
+                             ['float32', 'float64'],
+                             'logsumexp')
 
-    return layers.log(sum_out, name)
+    helper = LayerHelper('logsumexp', **locals())
+    attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    return out
 
 
-def inverse(input, name=None):
+def inverse(x, name=None):
     """
-	:alias_main: paddle.inverse
-	:alias: paddle.inverse,paddle.tensor.inverse,paddle.tensor.math.inverse
-
     Takes the inverse of the square matrix. A square matrix is a matrix with
     the same number of rows and columns. The input can be a square matrix
     (2-D Tensor) or batches of square matrices.
 
     Args:
-        input (Variable): The input Variable which holds a Tensor. The last two
+        x (Variable): The input tensor. The last two
             dimensions should be equal. When the number of dimensions is
             greater than 2, it is treated as batches of square matrix. The data
             type can be float32 and float64.
@@ -1017,201 +1254,227 @@ def inverse(input, name=None):
             please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: A Tensor holds the inverse of input. The shape and data type
-            is the same as input.
+        Variable: A Tensor holds the inverse of x. The shape and data type
+                        is the same as x.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
 
             mat_np = np.array([[2, 0], [0, 2]]).astype("float32")
+            paddle.disable_static()
+            mat = paddle.to_variable(mat_np)
+            inv = paddle.inverse(mat)
+            print(inv) # [[0.5, 0], [0, 0.5]]
 
-            # example for static graph
-            input = fluid.data("input", shape=[2, 2], dtype="float32")
-            out = paddle.inverse(input)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            results = exe.run(feed={"input": mat_np },
-                              fetch_list=[out.name])
-            print(results[0]) # [[0.5, 0], [0, 0.5]]
-
-            # example for dynamic graph
-            with fluid.dygraph.guard():
-                mat = fluid.dygraph.to_variable(mat_np)
-                inv = paddle.inverse(mat)
-                print(inv) # [[0.5, 0], [0, 0.5]]
     """
     if in_dygraph_mode():
-        return core.ops.inverse(input)
+        return core.ops.inverse(x)
 
-    def _check_input(input):
-        check_variable_and_dtype(input, 'input',
+    def _check_input(x):
+        check_variable_and_dtype(x, 'x',
                                  ['float32', 'float64'], 'inverse')
-        if len(input.shape) < 2:
+        if len(x.shape) < 2:
             raise ValueError(
                 "The input of inverse is expected to be a Tensor whose number "
                 "of dimensions is no less than 2. But reviced: %d, "
-                "input's shape: %s." % (len(input.shape), input.shape))
-
-    _check_input(input)
-
+                "x's shape: %s." % (len(x.shape), x.shape))
+    _check_input(x)
     helper = LayerHelper('inverse', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
-        type='inverse', inputs={'Input': [input] }, outputs={'Output': [out]})
+        type='inverse', inputs={'Input': [x] }, outputs={'Output': [out]})
     return out
 
 
-def max(input, dim=None, keep_dim=False, name=None):
+def max(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.max
-	:alias: paddle.max,paddle.tensor.max,paddle.tensor.math.max
 
-    Computes the maximum of tensor elements over the given dimension.
+    Computes the maximum of tensor elements over the given axis.
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
+        x(Tensor): A tensor, the data type is float32,
             float64, int32, int64.
-        dim (list|int, optional): The dimension along which the maximum is computed.
+        axis(list|int, optional): The axis along which the maximum is computed.
             If :attr:`None`, compute the maximum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
+             `x` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`.
+            If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            than the `x` unless :attr:`keepdim` is true, default
             value is False.
         name(str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, results of maximum on the specified dim of input tensor,
-        it's data type is the same as input's Tensor.
+        Tensor, results of maximum on the specified axis of input tensor,
+        it's data type is the same as `x`.
 
     Examples:
         .. code-block:: python
+
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
 
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.max(x)  # [0.9]
-            paddle.max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
-            paddle.max(x, dim=-1)  # [0.9, 0.7]
-            paddle.max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            paddle.max(y, dim=[1, 2]) # [4.0, 8.0]
-            paddle.max(y, dim=[0, 1]) # [7.0, 8.0]
+            paddle.disable_static()
+
+            # data_x is a variable with shape [2, 4]
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                               [0.1, 0.2, 0.6, 0.7]])
+            x = paddle.to_variable(data_x)
+            result1 = paddle.max(x)
+            print(result1.numpy())
+            #[0.9]
+            result2 = paddle.max(x, axis=0)
+            print(result2.numpy()) 
+            #[0.2 0.3 0.6 0.9]
+            result3 = paddle.max(x, axis=-1)
+            print(result3.numpy())
+            #[0.9 0.7]
+            result4 = paddle.max(x, axis=1, keepdim=True)
+            print(result4.numpy())
+            #[[0.9]
+            # [0.7]]
+
+            # data_y is a variable with shape [2, 2, 2]
+            # the axis is list 
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_variable(data_y)
+            result5 = paddle.max(y, axis=[1, 2])
+            print(result5.numpy())
+            #[4. 8.]
+            result6 = paddle.max(y, axis=[0, 1])
+            print(result6.numpy())
+            #[7. 8.]
     """
 
-    helper = LayerHelper('max', **locals())
-    out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'max')
-
-    reduce_all = True if dim == None or dim == [] else False
-    dim = dim if dim != None and dim != [] else [0]
+    if axis is not None and not isinstance(axis, list):
+        if isinstance(axis, tuple):
+            axis = list(axis)
+        elif isinstance(axis, int):
+            axis= [axis]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".format(type(axis)))
 
+    reduce_all = True if axis == None or axis == [] else False
+    axis = axis if axis != None and axis != [] else [0]
     if in_dygraph_mode():
-        return core.ops.reduce_max(input, 'dim', dim, 'keep_dim', keep_dim,
+        return core.ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim,
                                    'reduce_all', reduce_all)
+
+    helper = LayerHelper('max', **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max')
+
+    out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_max',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs={
-            'dim': dim,
-            'keep_dim': keep_dim,
+            'dim': axis,
+            'keep_dim': keepdim,
             'reduce_all': reduce_all
         })
     return out
 
-
-def min(input, dim=None, keep_dim=False, name=None):
+def min(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.min
-	:alias: paddle.min,paddle.tensor.min,paddle.tensor.math.min
 
-    Computes the minimum of tensor elements over the given dimension.
+    Computes the minimum of tensor elements over the given axis
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
-            float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the minimum is computed.
+        x(Tensor): A tensor, the data type is float32, float64, int32, int64.
+        axis(list|int, optional): The axis along which the minimum is computed.
             If :attr:`None`, compute the minimum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
+            `x` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-x.ndim, x.ndim)`.
+            If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            than the `x` unless :attr:`keepdim` is true, default
             value is False.
         name(str, optional): The default value is None.  Normally there is no need for 
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, result of minimum on the specified dim of input tensor,
+        Tensor, results of minimum on the specified axis of input tensor,
         it's data type is the same as input's Tensor.
 
     Examples:
         .. code-block:: python
+
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.min(x)  # [0.1]
-            paddle.min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
-            paddle.min(x, dim=-1)  # [0.2, 0.1]
-            paddle.min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            paddle.min(y, dim=[1, 2]) # [1.0, 5.0]
-            paddle.min(y, dim=[0, 1]) # [1.0, 2.0]
+
+            paddle.disable_static()
+
+            # data_x is a variable with shape [2, 4]
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                            [0.1, 0.2, 0.6, 0.7]])
+            x = paddle.to_variable(data_x)
+            result1 = paddle.min(x)
+            print(result1.numpy())
+            #[0.1]
+            result2 = paddle.min(x, axis=0)
+            print(result2.numpy())
+            #[0.1 0.2 0.5 0.7]
+            result3 = paddle.min(x, axis=-1)
+            print(result3.numpy()) 
+            #[0.2 0.1]
+            result4 = paddle.min(x, axis=1, keepdim=True)
+            print(result4.numpy())
+            #[[0.2]
+            # [0.1]]
+
+            # data_y is a variable with shape [2, 2, 2]
+            # the axis is list 
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_variable(data_y)
+            result5 = paddle.min(y, axis=[1, 2])
+            print(result5.numpy()) 
+            #[1. 5.]
+            result6 = paddle.min(y, axis=[0, 1])
+            print(result6.numpy())
+            #[1. 2.]
     """
 
-    helper = LayerHelper('min', **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+    if axis is not None and not isinstance(axis, list):
+        if isinstance(axis, tuple):
+            axis = list(axis)
+        elif isinstance(axis, int):
+            axis= [axis]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".format(type(axis)))
+    reduce_all = True if axis == None or axis == [] else False
+    axis = axis if axis != None and axis != [] else [0]
+    if in_dygraph_mode():
+        return core.ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim,
+                                   'reduce_all', reduce_all)
 
+    helper = LayerHelper('min', **locals())
     check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'max')
-
-    reduce_all = True if dim == None or dim == [] else False
-    dim = dim if dim != None and dim != [] else [0]
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'min')
 
-    if in_dygraph_mode():
-        return core.ops.reduce_min(input, 'dim', dim, 'keep_dim', keep_dim,
-                                   'reduce_all', reduce_all)
+    out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_min',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs={
-            'dim': dim,
-            'keep_dim': keep_dim,
+            'dim': axis,
+            'keep_dim': keepdim,
             'reduce_all': reduce_all
         })
     return out
@@ -1302,14 +1565,14 @@ def addcmul(input, tensor1, tensor2, value=1.0, name=None):
     return out
 
 
-def clamp(input, min=None, max=None, name=None):
+def clip(x, min=None, max=None, name=None):
     """
-	:alias_main: paddle.clamp
-	:alias: paddle.clamp,paddle.tensor.clamp,paddle.tensor.math.clamp
+        :alias_main: paddle.clip
+        :alias: paddle.clip,paddle.tensor.clip,paddle.tensor.math.clip
 
-    **clampe layer**
+    **clip layer**
 
-    This operator clamps all elements in input into the range [ min, max ] and return
+    This operator clip all elements in input into the range [ min, max ] and return
     a resulting tensor as the following equation:
 
     .. math::
@@ -1317,60 +1580,63 @@ def clamp(input, min=None, max=None, name=None):
         Out = MIN(MAX(x, min), max)
 
     Args:
-        input (Variable): An input N-D Tensor or LoDTensor
-            with data type float32, float64.
-        min (float32|Variable): The lower bound with type ``float32`` or a ``Tensor``
+        x (Tensor): An N-D Tensor with data type float32 or float64.
+        min (float32|Tensor): The lower bound with type ``float32`` or a ``Tensor``
             with shape [1] and type ``int32``, ``float32``, ``float64``.
-        max (float32|Variable): The upper bound with type ``float32`` or a ``Tensor``
+        max (float32|Tensor): The upper bound with type ``float32`` or a ``Tensor``
             with shape [1] and type ``int32``, ``float32``, ``float64``.
         name (str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
             refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A Tensor or LodTensor with the same data type and data shape as input's.
+        Tensor: A Tensor with the same data type and data shape as input.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
             import numpy as np
 
-            in1 = np.array([[1.2,3.5],
-                            [4.5,6.4]]).astype('float32')
-            with fluid.dygraph.guard():
-                x1 = fluid.dygraph.to_variable(in1)
-                out1 = paddle.tensor.clamp(x1, min=3.5, max=5.0)
-                out2 = paddle.tensor.clamp(x1, min=2.5)
-                print(out1.numpy())
-                # [[3.5, 3.5]
-                # [4.5, 5.0]]
-                print(out2.numpy())
-                # [[2.5, 3.5]
-                # [[4.5, 6.4]
+            paddle.disable_static()
+            x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32')
+            x1 = paddle.to_variable(x)
+            out1 = paddle.clip(x1, min=3.5, max=5.0)
+            out2 = paddle.clip(x1, min=2.5)
+            print(out1.numpy())
+            # [[3.5, 3.5]
+            # [4.5, 5.0]]
+            print(out2.numpy())
+            # [[2.5, 3.5]
+            # [[4.5, 6.4]
     """
 
-    assert min is not None or max is not None, "either min or max should be defined."
+    np_dtype = np.float32
+    if x.dtype == VarDesc.VarType.FP64:
+        np_dtype = np.float64
+    fmin = float(np.finfo(np_dtype).min)
+    fmax = float(np.finfo(np_dtype).max)
 
     if in_dygraph_mode():
-        min = sys.float_info.min if min is None else min
-        max = sys.float_info.max if max is None else max
-        return core.ops.clip(input, "min", min, "max", max)
+        min = fmin if min is None else min
+        max = fmax if max is None else max
+        return core.ops.clip(x, "min", min, "max", max)
 
     if min is not None:
-        check_type(min, 'min', (float, Variable), 'clamp')
+        check_type(min, 'min', (float, int, Variable), 'clip')
         if isinstance(min, Variable):
             check_dtype(min.dtype, 'min', ['float32', 'float64', 'int32'],
-                        'clamp', '(When the type of min in clamp is Variable.)')
+                        'clip', '(When the type of min in clip is Variable.)')
     if max is not None:
-        check_type(max, 'max', (float, Variable), 'clamp')
+        check_type(max, 'max', (float, int, Variable), 'clip')
         if isinstance(max, Variable):
             check_dtype(max.dtype, 'max', ['float32', 'float64', 'int32'],
-                        'clamp', '(When the type of max in clamp is Variable.)')
+                        'clip', '(When the type of max in clip is Variable.)')
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'clip')
 
-    inputs = {'X': input}
-    attrs = {'min': sys.float_info.min, 'max': sys.float_info.max}
+    inputs = {'X': x}
+    attrs = {'min': fmin, 'max': fmax}
 
     if isinstance(min, Variable):
         min.stop_gradient = True
@@ -1384,9 +1650,9 @@ def clamp(input, min=None, max=None, name=None):
     elif max is not None:
         attrs['max'] = max
 
-    helper = LayerHelper('clamp', **locals())
+    helper = LayerHelper('clip', **locals())
     output = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+        dtype=helper.input_dtype())
     helper.append_op(
         type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs)
 
@@ -1432,11 +1698,11 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None):
             case2 = np.random.randn(3, 10, 10).astype('float32')
             case3 = np.random.randn(3, 10, 5, 10).astype('float32')
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
-            case1 = paddle.imperative.to_variable(case1)
-            case2 = paddle.imperative.to_variable(case2)
-            case3 = paddle.imperative.to_variable(case3)
+            case1 = paddle.to_variable(case1)
+            case2 = paddle.to_variable(case2)
+            case3 = paddle.to_variable(case3)
             data1 = paddle.trace(case1) # data1.shape = [1]
             data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3]
             data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5]
@@ -1543,3 +1809,322 @@ ${comment}
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="kron", inputs={"X": x, "Y": y}, outputs={"Out": out})
     return out
+
+
+def cumsum(x, axis=None, dtype=None, name=None):
+    """
+    The cumulative sum of the elements along a given axis. The first element of the result is the same of the first element of the input. 
+
+    Args:
+        x (Tensor): Input of cumsum operator, the Tensor needed to be cumsumed. 
+        axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
+        dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. 
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, the result of cumsum operator, output of cumsum operator. 
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            from paddle import to_variable
+            import numpy as np
+
+            paddle.disable_static()
+            data_np = np.arange(12).reshape(3, 4)
+            data = to_variable(data_np)
+
+            y = paddle.cumsum(data)
+            print(y.numpy())
+            # [ 0  1  3  6 10 15 21 28 36 45 55 66]
+
+            y = paddle.cumsum(data, axis=0)
+            print(y.numpy())
+            # [[ 0  1  2  3]
+            #  [ 4  6  8 10]
+            #  [12 15 18 21]]
+            
+            y = paddle.cumsum(data, axis=-1)
+            print(y.numpy())
+            # [[ 0  1  3  6]
+            #  [ 4  9 15 22]
+            #  [ 8 17 27 38]]
+
+            y = paddle.cumsum(data, dtype='float64')
+            print(y.dtype)
+            # VarType.FP64
+    """
+    if axis is None:
+        flatten = True
+    else:
+        flatten = False
+    if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
+        x = layers.cast(x, dtype)
+
+    if in_dygraph_mode():
+        if axis is None:
+            return core.ops.cumsum(x, 'flatten', flatten)
+        else:
+            return core.ops.cumsum(x, 'axis', axis, 'flatten', flatten)
+
+    check_type(x, 'x', (Variable), 'cumsum')
+    locals_var = locals().copy()
+    kwargs = dict()
+    for name, val in locals_var.items():
+        if val is not None:
+            kwargs[name] = val
+    _cum_sum_ = generate_layer_fn('cumsum')
+    return _cum_sum_(**kwargs)
+
+def isfinite(x, name=None):
+    """
+
+    Return whether every element of input tensor is finite number or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is finite number or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isfinite(x)
+            print(out.numpy())  # [False  True  True False  True False False]
+    """
+    if in_dygraph_mode():
+        return core.ops.isfinite_v2(x)
+    helper = LayerHelper("isfinite_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite')
+    out = helper.create_variable_for_type_inference('bool')
+    helper.append_op(type="isfinite_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+def isinf(x, name=None):
+    """
+
+    Return whether every element of input tensor is `+/-INF` or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is `+/-INF` or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isinf(x)
+            print(out.numpy())  # [ True False False  True False False False]
+    """
+    if in_dygraph_mode():
+        return core.ops.isinf_v2(x)
+    helper = LayerHelper("isinf_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf')
+    out = helper.create_variable_for_type_inference(dtype='bool')
+    helper.append_op(type="isinf_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+def isnan(x, name=None):
+    """
+
+    Return whether every element of input tensor is `NaN` or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is `NaN` or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isnan(x)
+            print(out.numpy())  # [False False False False False  True  True]
+    """
+    if in_dygraph_mode():
+        return core.ops.isnan_v2(x)
+    helper = LayerHelper("isnan_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan')
+    out = helper.create_variable_for_type_inference(dtype='bool')
+    helper.append_op(type="isnan_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def prod(x, axis=None, keepdim=False, dtype=None, name=None):
+    """
+    Compute the product of tensor elements over the given axis.
+
+    Args:
+        x(Tensor): The input tensor, its data type should be float32, float64, int32, int64.
+        axis(int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, 
+            multiply all elements of `x` and return a Tensor with a single element, 
+            otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, 
+            the axis to reduce is :math:`x.ndim + axis[i]`. Default is None.
+        dtype(str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, 
+            int32, int64. If specified, the input tensor is casted to dtype before operator performed. 
+            This is very useful for avoiding data type overflows. The default value is None, the dtype 
+            of output is the same as input Tensor `x`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result 
+            tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False.
+        name(string, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor, result of product on the specified dim of input tensor.
+
+    Raises:
+        ValueError: The :attr:`dtype` must be float32, float64, int32 or int64.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                         [0.1, 0.2, 0.6, 0.7]]).astype(np.float32)
+            x = paddle.to_tensor(data_x)
+            out1 = paddle.prod(x)
+            print(out1.numpy())
+            # [0.0002268]
+
+            out2 = paddle.prod(x, -1)
+            print(out2.numpy())
+            # [0.027  0.0084]
+
+            out3 = paddle.prod(x, 0)
+            print(out3.numpy())
+            # [0.02 0.06 0.3  0.63]
+            print(out3.numpy().dtype)
+            # float32
+
+            out4 = paddle.prod(x, 0, keepdim=True)
+            print(out4.numpy())
+            # [[0.02 0.06 0.3  0.63]]
+
+            out5 = paddle.prod(x, 0, dtype='int64')
+            print(out5.numpy())
+            # [0 0 0 0]
+            print(out5.numpy().dtype)
+            # int64
+
+            # the axis is list
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_tensor(data_y)
+            out6 = paddle.prod(y, [0, 1])
+            print(out6.numpy())
+            # [105. 384.]
+
+            out7 = paddle.prod(y, (1, 2))
+            print(out7.numpy())
+            # [  24. 1680.]
+
+    """
+    if dtype is not None:
+        check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'prod')
+        if x.dtype != convert_np_dtype_to_dtype_(dtype):
+            x = layers.cast(x, dtype)
+
+    return layers.reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name)
+
+
+def sign(x, name=None):
+    """
+    This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
+
+    Args:
+        x(Tensor): The input tensor. The data type can be float16, float32 or float64.
+        name (str, optional): The default value is None. Normally there is no need for user to
+            set this property. For more information, please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tensor: The output sign tensor with identical shape and data type to the input :attr:`x`.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+
+          data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(data)
+          out = paddle.sign(x=x)
+          print(out)  # [1.0, 0.0, -1.0, 1.0]
+    """
+    if in_dygraph_mode():
+        return core.ops.sign(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sign')
+    helper = LayerHelper("sign", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(type='sign', inputs={'X': [x]}, outputs={'Out': [out]})
+
+    return out
+
+
+def tanh(x, name=None):
+    """
+    Tanh Activation Operator.
+
+    .. math::
+        out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
+
+    Args:
+        x (Tensor): Input of Tanh operator, an N-D Tensor, with data type float32, float64 or float16.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Output of Tanh operator, a Tensor with same data type and shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+            x = paddle.to_tensor(x_data)
+            out = paddle.tanh(x)
+            print(out.numpy())
+            # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+    """
+    if in_dygraph_mode():
+        return core.ops.tanh(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanh')
+    check_type(x, 'x', (Variable), 'tanh')
+    helper = LayerHelper('tanh', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='tanh', inputs={'X': x}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 5e9f55cd34c3e3e5cedee10352c7a5d96fbb8abc..c652d0f1891c8bd0a4c85ea777527a2fd82ad11b 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -21,22 +21,411 @@ from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, V
 from ..fluid.layers.layer_function_generator import templatedoc
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
-from ..fluid.layers import utils, uniform_random, gaussian_random
+from ..fluid.layers import utils
 from ..fluid.layers.tensor import fill_constant
+import paddle
+import warnings
 
 from ..fluid.io import shuffle  #DEFINE_ALIAS
 
 __all__ = [
-    #       'gaussin',
-    #       'uniform',
+    'bernoulli',
+    'standard_normal',
+    'normal',
+    'uniform',
     'shuffle',
     'randn',
     'rand',
     'randint',
-    'randperm'
+    'randperm',
 ]
 
 
+def bernoulli(x, name=None):
+    """
+
+    This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
+    The input ``x`` is a tensor with probabilities for generating the random binary number.
+    Each element in ``x`` should be in [0, 1], and the out is generated by:
+    
+    .. math::
+
+        out_i ~ Bernoulli (x_i)
+
+    Args:
+        x(Tensor):  A tensor with probabilities for generating the random binary number. The data type 
+            should be float32, float64.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns: 
+        Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.rand([2, 3])
+        print(x.numpy())
+        # [[0.11272584 0.3890902  0.7730957 ]
+        # [0.10351662 0.8510418  0.63806665]]
+
+        out = paddle.bernoulli(x)
+        print(out.numpy())
+        # [[0. 0. 1.]
+        # [0. 0. 1.]]
+
+    """
+
+    if in_dygraph_mode():
+        return core.ops.bernoulli(x)
+
+    check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli")
+
+    helper = LayerHelper("randint", **locals())
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype)  # maybe set out to int32 ? 
+    helper.append_op(
+        type='bernoulli', inputs={"X": x}, outputs={'Out': out}, attrs={})
+    return out
+
+
+def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a Gaussian
+    distribution, with ``shape`` and ``dtype``.
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        mean(float|int, optional): Mean of the output tensor, default is 0.0.
+        std(float|int, optional): Standard deviation of the output tensor, default
+            is 1.0.
+        seed(int, optional): ${seed_comment}
+        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of
+            the output Tensor. Supported data types: float32, float64.
+            Default is float32.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a Gaussian
+        distribution, with ``shape`` and ``dtype``. 
+    """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+    seed = 0
+    op_type_for_check = 'gaussian_random/standard_normal/randn/normal'
+
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.gaussian_random('shape', shape, 'mean',
+                                        float(mean), 'std',
+                                        float(std), 'seed', seed, 'dtype',
+                                        dtype)
+
+    check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check)
+    check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check)
+
+    inputs = {}
+    attrs = {
+        'mean': mean,
+        'std': std,
+        'seed': seed,
+        'dtype': dtype,
+        'use_mkldnn': False
+    }
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check)
+
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='gaussian_random',
+        inputs=inputs,
+        outputs={'Out': out},
+        attrs=attrs)
+    out.stop_gradient = True
+    return out
+
+
+def standard_normal(shape, dtype=None, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a standard
+    normal distribution with mean 0 and standard deviation 1, with ``shape``
+    and ``dtype``.
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
+            output tensor. Supported data types: float32, float64. If ``dytpe``
+            is None, the data type is float32. Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a standard
+        normal distribution with mean 0 and standard deviation 1, with
+        ``shape`` and ``dtype``.
+
+    Raises:
+        TypeError: If ``shape`` is not list, tuple, Tensor.
+        TypeError: If ``dtype`` is not float32, float64.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # example 1: attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.standard_normal(shape=[2, 3])
+            # [[-2.923464  ,  0.11934398, -0.51249987],  # random
+            #  [ 0.39632758,  0.08177969,  0.2692008 ]]  # random
+
+            # example 2: attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2])
+            # [[[-2.8852394 , -0.25898588],  # random
+            #   [-0.47420555,  0.17683524],  # random
+            #   [-0.7989969 ,  0.00754541]],  # random
+            #  [[ 0.85201347,  0.32320443],  # random
+            #   [ 1.1399018 ,  0.48336947],  # random
+            #   [ 0.8086993 ,  0.6868893 ]]]  # random
+
+            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            var_shape = paddle.to_tensor(np.array([2, 3]))
+            result_3 = paddle.standard_normal(var_shape)
+            # [[-2.878077 ,  0.17099959,  0.05111201]  # random
+            #  [-0.3761474, -1.044801  ,  1.1870178 ]]  # random
+
+    """
+    if dtype is None:
+        dtype = 'float32'
+
+    return gaussian_random(
+        shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name)
+
+
+randn = standard_normal
+
+
+def normal(mean=0.0, std=1.0, shape=None, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a normal
+    distribution with ``mean`` and ``std`` (standard deviation) .
+
+    If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``.
+    If ``mean`` is not a Tensor and ``std`` is a Tensor, the output Tensor has the same shape and data type as ``std``.
+    If ``mean`` and ``std`` are not a Tensor, the output Tensor has the same shape as ``shape``, with data type float32.
+
+    If ``mean`` and ``std`` are Tensor, the num of elements of ``mean`` and ``std`` should be the same.
+
+    Args:
+        mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
+            If ``mean`` is float, all elements of the output Tensor shared the same mean.
+            If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means.
+            Default is 0.0
+        std (float|Tensor, optional): The  standard deviation of the output Tensor's normal distribution.
+            If ``std`` is float, all elements of the output Tensor shared the same standard deviation.
+            If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations.
+            Defaule is 1.0
+        shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64). If ``mean`` or ``std`` is a Tensor, the shape of the output
+            Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored.
+            Default is None
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            out1 = paddle.normal(shape=[2, 3])
+            # [[ 0.17501129  0.32364586  1.561118  ]  # random
+            #  [-1.7232178   1.1545963  -0.76156676]]  # random
+
+            mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0]))
+            out2 = paddle.normal(mean=mean_tensor)
+            # [ 0.18644847 -1.19434458  3.93694787]  # random
+
+            std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0]))
+            out3 = paddle.normal(mean=mean_tensor, std=std_tensor)
+            # [1.00780561 3.78457445 5.81058198]  # random
+
+    """
+    if not in_dygraph_mode():
+        check_type(mean, 'mean', (int, float, Variable), 'normal')
+        check_type(std, 'std', (int, float, Variable), 'normal')
+        if isinstance(mean, Variable):
+            check_dtype(
+                mean.dtype, 'mean', ['float32', 'float64'], 'normal',
+                "If mean is Tensor, it's data type only support float32, float64."
+            )
+        if isinstance(std, Variable):
+            check_dtype(
+                std.dtype, 'std', ['float32', 'float64'], 'normal',
+                "If std is Tensor, it's data type only support float32, float64."
+            )
+        if shape is not None:
+            if isinstance(shape, (list, tuple)):
+                for item in shape:
+                    check_type(item, 'shape', (int), 'normal',
+                               'Elements of shape should be int.')
+            elif isinstance(shape, Variable):
+                check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal')
+            else:
+                assert TypeError(
+                    'If mean and std are all not Tensor, shape should be list, tuple, Tensor.'
+                )
+
+    if isinstance(mean, Variable):
+        if isinstance(std, Variable):
+            if std.dtype != mean.dtype:
+                std = paddle.cast(std, mean.dtype)
+            mean_shape = paddle.shape(mean)
+            std = paddle.reshape(std, mean_shape)
+        else:
+            std = float(std)
+        out = standard_normal(paddle.shape(mean), mean.dtype, name)
+    elif isinstance(std, Variable):
+        mean = float(mean)
+        out = standard_normal(paddle.shape(std), std.dtype, name)
+    else:
+        return gaussian_random(shape=shape, mean=mean, std=std, name=name)
+
+    out = out * std + mean
+    if not in_dygraph_mode():
+        out.stop_grediant = True
+    return out
+
+
+def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a uniform
+    distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``.
+
+    Examples:
+    ::
+        Input:
+          shape = [1, 2]
+        Output:
+          result=[[0.8505902, 0.8397286]]
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        dtype(str|np.dtype, optional): The data type of
+            the output Tensor. Supported data types: float32, float64.
+            Default is float32.
+        min(float|int, optional): The lower bound on the range of random values
+            to generate, ``min`` is included in the range. Default is -1.0.
+        max(float|int, optional): The upper bound on the range of random values
+            to generate, ``max`` is excluded in the range. Default is 1.0.
+        seed(int, optional): Random seed used for generating samples. 0 means
+            use a seed generated by the system. Note that if seed is not 0,
+            this operator will always generate the same random numbers every
+            time. Default is 0.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a uniform
+        distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``.
+
+    Raises:
+        TypeError: If ``shape`` is not list, tuple, Tensor.
+        TypeError: If ``dtype`` is not float32, float64.
+
+    Examples:
+        .. code-block:: python
+            
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            # example 1:
+            # attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.tensor.random.uniform(shape=[3, 4])
+            # [[ 0.84524226,  0.6921872,   0.56528175,  0.71690357],
+            #  [-0.34646994, -0.45116323, -0.09902662, -0.11397249],
+            #  [ 0.433519,    0.39483607, -0.8660099,   0.83664286]]
+
+            # example 2:
+            # attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.tensor.random.uniform(shape=[dim_1, dim_2])
+            # [[-0.9951253,   0.30757582, 0.9899647 ],
+            #  [ 0.5864527,   0.6607096,  -0.8886161 ]]
+
+            # example 3:
+            # attr shape is a Tensor, the data type must be int64 or int32.
+            shape = np.array([2, 3])
+            shape_tensor = paddle.to_tensor(shape)
+            result_3 = paddle.tensor.random.uniform(shape_tensor)
+            # if shape_tensor's value is [2, 3]
+            # result_3 is:
+            # [[-0.8517412,  -0.4006908,   0.2551912 ],
+            #  [ 0.3364414,   0.36278176, -0.16085452]]
+
+
+    """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.uniform_random('shape', shape, 'min',
+                                       float(min), 'max',
+                                       float(max), 'seed', seed, 'dtype', dtype)
+
+    check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
+    check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand')
+
+    inputs = dict()
+    attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand')
+
+    helper = LayerHelper("uniform_random", **locals())
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="uniform_random", inputs=inputs, attrs=attrs,
+        outputs={"Out": out})
+    return out
+
+
 def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     """
 	:alias_main: paddle.randint
@@ -78,40 +467,40 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-
-        # example 1:
-        # attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.randint(low=-5, high=5, shape=[3])
-        # [0, -3, 2]
-
-        # example 2:
-        # attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
-        # [[0, -1, -3],
-        #  [4, -2,  0]]
-
-        # example 3:
-        # attr shape is a Tensor
-        var_shape = paddle.imperative.to_variable(np.array([3]))
-        result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
-        # [-2, 2, 3]
-
-        # example 4:
-        # data type is int32
-        result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
-        # [-5, 4, -4]
-
-        # example 5:
-        # Input only one parameter
-        # low=0, high=10, shape=[1], dtype='int64'
-        result_5 = paddle.randint(10)
-        # [7]
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # example 1:
+            # attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.randint(low=-5, high=5, shape=[3])
+            # [0, -3, 2]  # random
+
+            # example 2:
+            # attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
+            # [[0, -1, -3],  # random
+            #  [4, -2,  0]]  # random
+
+            # example 3:
+            # attr shape is a Tensor
+            var_shape = paddle.to_variable(np.array([3]))
+            result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
+            # [-2, 2, 3]  # random
+
+            # example 4:
+            # data type is int32
+            result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
+            # [-5, 4, -4]  # random
+
+            # example 5:
+            # Input only one parameter
+            # low=0, high=10, shape=[1], dtype='int64'
+            result_5 = paddle.randint(10)
+            # [7]  # random
 
     """
     if high is None:
@@ -150,77 +539,6 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     return out
 
 
-def randn(shape, dtype=None, name=None):
-    """
-	:alias_main: paddle.randn
-	:alias: paddle.tensor.randn, paddle.tensor.random.randn
-
-    This OP returns a Tensor filled with random values sampled from a normal
-    distribution with mean 0 and standard deviation 1 (also called the standard
-    normal distribution), with ``shape`` and ``dtype``.
-
-    Args:
-        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
-            is a list or tuple, the elements of it should be integers or Tensors
-            (with the shape [1], and the data type int32 or int64). If ``shape``
-            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
-            int64).
-        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
-            output tensor. Supported data types: float32, float64. If ``dytpe``
-            is None, the data type is float32. Default is None.
-        name(str, optional): The default value is None. Normally there is no
-            need for user to set this property. For more information, please
-            refer to :ref:`api_guide_Name`.
-
-    Returns:
-        Tensor: A Tensor filled with random values sampled from a normal
-        distribution with mean 0 and standard deviation 1 (also called the
-        standard normal distribution), with ``shape`` and ``dtype``.
-
-    Raises:
-        TypeError: If ``shape`` is not list, tuple, Tensor.
-        TypeError: If ``dtype`` is not float32, float64.
-
-    Examples:
-        .. code-block:: python
-
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-
-        # example 1: attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.randn(shape=[2, 3])
-        # [[-2.923464  ,  0.11934398, -0.51249987],
-        #  [ 0.39632758,  0.08177969,  0.2692008 ]]
-
-        # example 2: attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.randn(shape=[dim_1, dim_2, 2])
-        # [[[-2.8852394 , -0.25898588],
-        #   [-0.47420555,  0.17683524],
-        #   [-0.7989969 ,  0.00754541]],
-        #  [[ 0.85201347,  0.32320443],
-        #   [ 1.1399018 ,  0.48336947],
-        #   [ 0.8086993 ,  0.6868893 ]]]
-
-        # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
-        result_3 = paddle.randn(var_shape)
-        # [[-2.878077 ,  0.17099959,  0.05111201]
-        #  [-0.3761474, -1.044801  ,  1.1870178 ]]
-
-    """
-    if dtype is None:
-        dtype = 'float32'
-
-    out = gaussian_random(
-        shape=shape, mean=0.0, std=1.0, seed=0, dtype=dtype, name=name)
-    out.stop_gradient = True
-    return out
-
-
 @templatedoc()
 def randperm(n, dtype="int64", name=None):
     """
@@ -250,15 +568,15 @@ def randperm(n, dtype="int64", name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
+            import paddle
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        result_1 = paddle.randperm(5)
-        # [4, 1, 2, 3, 0]
+            result_1 = paddle.randperm(5)
+            # [4, 1, 2, 3, 0]  # random
 
-        result_2 = paddle.randperm(7, 'int32')
-        # [1, 6, 2, 0, 4, 3, 5]
+            result_2 = paddle.randperm(7, 'int32')
+            # [1, 6, 2, 0, 4, 3, 5]  # random
  
     """
     if not isinstance(dtype, core.VarDesc.VarType):
@@ -322,36 +640,36 @@ def rand(shape, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-        # example 1: attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.rand(shape=[2, 3])
-        # [[0.451152  , 0.55825245, 0.403311  ],
-        #  [0.22550228, 0.22106001, 0.7877319 ]]
-
-        # example 2: attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.rand(shape=[dim_1, dim_2, 2])
-        # [[[0.8879919 , 0.25788337],
-        #   [0.28826773, 0.9712097 ],
-        #   [0.26438272, 0.01796806]],
-        #  [[0.33633623, 0.28654453],
-        #   [0.79109055, 0.7305809 ],
-        #   [0.870881  , 0.2984597 ]]]
-
-        # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
-        result_3 = paddle.rand(var_shape)
-        # [[0.22920267, 0.841956  , 0.05981819],
-        #  [0.4836288 , 0.24573246, 0.7516129 ]]
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            # example 1: attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.rand(shape=[2, 3])
+            # [[0.451152  , 0.55825245, 0.403311  ],  # random
+            #  [0.22550228, 0.22106001, 0.7877319 ]]  # random
+
+            # example 2: attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.rand(shape=[dim_1, dim_2, 2])
+            # [[[0.8879919 , 0.25788337],  # random
+            #   [0.28826773, 0.9712097 ],  # random
+            #   [0.26438272, 0.01796806]],  # random
+            #  [[0.33633623, 0.28654453],  # random
+            #   [0.79109055, 0.7305809 ],  # random
+            #   [0.870881  , 0.2984597 ]]]  # random
+
+            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            var_shape = paddle.to_variable(np.array([2, 3]))
+            result_3 = paddle.rand(var_shape)
+            # [[0.22920267, 0.841956  , 0.05981819],  # random
+            #  [0.4836288 , 0.24573246, 0.7516129 ]]  # random
 
     """
     if dtype is None:
         dtype = 'float32'
 
-    out = uniform_random(shape, dtype, min=0.0, max=1.0, name=name)
+    out = uniform(shape, dtype, min=0.0, max=1.0, name=name)
     out.stop_gradient = True
     return out
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 1cb775c9d4b73beaf0f2167fe7fc9909e91d116d..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -21,7 +21,6 @@ from ..fluid import core, layers
 from ..fluid.layers import argmin  #DEFINE_ALIAS
 from ..fluid.layers import has_inf  #DEFINE_ALIAS
 from ..fluid.layers import has_nan  #DEFINE_ALIAS
-from ..fluid.layers import topk  #DEFINE_ALIAS
 
 __all__ = [
     'argmax',
@@ -29,13 +28,13 @@ __all__ = [
     'argsort',
     'has_inf',
     'has_nan',
-    #       'masked_select',
+    'masked_select',
     'topk',
     'where',
     'index_select',
     'nonzero',
     'sort',
-    'index_sample'
+    'index_sample',
 ]
 
 from paddle.common_ops_import import *
@@ -68,17 +67,16 @@ def argsort(x, axis=-1, descending=False, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            import paddle.imperative as imperative 
             import numpy as np
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             input_array = np.array([[[5,8,9,5],
                             [0,0,1,7],
                             [6,9,2,4]],
                             [[5,2,4,2],
                             [4,7,7,9],
                             [1,7,0,6]]]).astype(np.float32)
-            x = imperative.to_variable(input_array)
+            x = paddle.to_variable(input_array)
             out1 = paddle.argsort(x=x, axis=-1)
             out2 = paddle.argsort(x=x, axis=0)
             out3 = paddle.argsort(x=x, axis=1)
@@ -126,95 +124,168 @@ def argsort(x, axis=-1, descending=False, name=None):
     return ids
 
 
-def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None):
+def argmax(x, axis=None, dtype=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.argmax
-	:alias: paddle.argmax,paddle.tensor.argmax,paddle.tensor.search.argmax
-
     This OP computes the indices of the max elements of the input tensor's
     element along the provided axis.
 
     Args:
-        input(Variable): An input N-D Tensor with type float32, float64, int16,
+        x(Tensor): An input N-D Tensor with type float32, float64, int16,
             int32, int64, uint8.
         axis(int, optional): Axis to compute indices along. The effective range
-            is [-R, R), where R is Rank(input). when axis<0, it works the same way
-            as axis+R. Default is None, it will use the last dim to select indices of max value.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
+        dtype(str): Data type of the output tensor which can
                     be int32, int64. The default value is None, and it will
                     return the int64 indices.
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result. Defalut is None.
-        keepdims(bool, optional): Keep the axis that do the select max.
+        keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False.
         name(str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
             refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A Tensor with data type int64.
+        Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`
 
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.fluid as fluid
             import numpy as np
+            import paddle
 
-            in1 = np.array([[[5,8,9,5],
-                            [0,0,1,7],
-                            [6,9,2,4]],
-                            [[5,2,4,2],
-                            [4,7,7,9],
-                            [1,7,0,6]]])
-            with fluid.dygraph.guard():
-                x = fluid.dygraph.to_variable(in1)
-                out1 = paddle.argmax(input=x, axis=-1)
-                out2 = paddle.argmax(input=x, axis=0)
-                out3 = paddle.argmax(input=x, axis=1)
-                out4 = paddle.argmax(input=x, axis=2)
-                out5 = paddle.argmax(input=x, axis=2, keepdims=True)
-                print(out1.numpy())
-                # [[2 3 1]
-                #  [0 3 1]]
-                print(out2.numpy())
-                # [[0 0 0 0]
-                #  [1 1 1 1]
-                #  [0 0 0 1]]
-                print(out3.numpy())
-                # [[2 2 0 1]
-                #  [0 1 1 1]]
-                print(out4.numpy())
-                # [[2 3 1]
-                #  [0 3 1]]
-                print(out5.numpy())
-                #array([[[2],
-                #        [3],
-                #        [1]],
-                #       [[0],
-                #        [3],
-                #        [1]]])
+            paddle.disable_static()
+            data = np.array([[5,8,9,5],
+                             [0,0,1,7],
+                             [6,9,2,4]])
+            x =  paddle.to_variable(data)
+            out1 = paddle.argmax(x)
+            print(out1.numpy()) # 2
+            out2 = paddle.argmax(x, axis=1)
+            print(out2.numpy()) 
+            # [2 3 1]
+            out3 = paddle.argmax(x, axis=-1)
+            print(out3.numpy()) 
+            # [2 3 1]
     """
-    helper = LayerHelper("arg_max", **locals())
+    flatten = False
+    if axis is None:
+        flatten = True
+        axis = 0
+
+    if in_dygraph_mode():
+        if dtype != None:
+            var_dtype = convert_np_dtype_to_dtype_(dtype)
+            out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype,
+                                   'keepdim', keepdim, 'flatten', flatten)
+        else:
+            out = core.ops.arg_max(x, 'axis', axis, 'keepdim', keepdim,
+                                   'flatten', flatten)
+        return out
+
+    helper = LayerHelper("argmax", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
+        'paddle.argmax')
     var_dtype = None
     attrs = {}
     if dtype is not None:
-        check_dtype(dtype, 'create data type', ['int32', 'int64'], 'arg_max')
+        if dtype not in ['int32', 'int64']:
+            raise ValueError(
+                "The value of 'dtype' in argmax op must be int32, int64, but received of {}".
+                format(dtype))
         var_dtype = convert_np_dtype_to_dtype_(dtype)
         attrs["dtype"] = var_dtype
     else:
         var_dtype = VarDesc.VarType.INT64
-    if out is None:
-        out = helper.create_variable_for_type_inference(var_dtype)
+
+    out = helper.create_variable_for_type_inference(var_dtype)
+    attrs['keepdims'] = keepdim
+    attrs['axis'] = axis
+    attrs['flatten'] = flatten
+    helper.append_op(
+        type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
+    out.stop_gradient = True
+    return out
+
+
+def argmin(x, axis=None, dtype=None, keepdim=False, name=None):
+    """
+    This OP computes the indices of the min elements of the input tensor's
+    element along the provided axis.
+
+    Args:
+        x(Tensor): An input N-D Tensor with type float32, float64, int16,
+            int32, int64, uint8.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
+        dtype(str): Data type of the output tensor which can
+                    be int32, int64. The default value is None, and it will
+                    return the int64 indices.
+        keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            data = np.array([[5,8,9,5],
+                             [0,0,1,7],
+                             [6,9,2,4]])
+            x =  paddle.to_variable(data)
+            out1 = paddle.argmin(x)
+            print(out1.numpy()) # 4
+            out2 = paddle.argmin(x, axis=1)
+            print(out2.numpy()) 
+            # [0 0 2]
+            out3 = paddle.argmin(x, axis=-1)
+            print(out3.numpy()) 
+            # [0 0 2]
+    """
+    flatten = False
     if axis is None:
-        axis = -1
-    attrs['keepdims'] = keepdims
+        flatten = True
+        axis = 0
+
+    if in_dygraph_mode():
+        if dtype != None:
+            var_dtype = convert_np_dtype_to_dtype_(dtype)
+            out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype,
+                                   'keepdim', keepdim, 'flatten', flatten)
+        else:
+            out = core.ops.arg_min(x, 'axis', axis, 'keepdim', keepdim,
+                                   'flatten', flatten)
+        return out
+
+    helper = LayerHelper("argmin", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
+        'paddle.argmin')
+    var_dtype = None
+    attrs = {}
+    if dtype is not None:
+        if dtype not in ['int32', 'int64']:
+            raise ValueError(
+                "The value of 'dtype' in argmin op must be int32, int64, but received of {}".
+                format(dtype))
+        var_dtype = convert_np_dtype_to_dtype_(dtype)
+        attrs["dtype"] = var_dtype
+    else:
+        var_dtype = VarDesc.VarType.INT64
+
+    out = helper.create_variable_for_type_inference(var_dtype)
+    attrs['keepdims'] = keepdim
     attrs['axis'] = axis
+    attrs['flatten'] = flatten
     helper.append_op(
-        type='arg_max',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs=attrs)
+        type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -250,14 +321,14 @@ def index_select(x, index, axis=0, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()  # Now we are in imperative mode
+            paddle.disable_static()  # Now we are in imperative mode
             data = np.array([[1.0, 2.0, 3.0, 4.0],
                              [5.0, 6.0, 7.0, 8.0],
                              [9.0, 10.0, 11.0, 12.0]])
             data_index = np.array([0, 1, 1]).astype('int32')
 
-            x = paddle.imperative.to_variable(data)
-            index = paddle.imperative.to_variable(data_index)
+            x = paddle.to_tensor(data)
+            index = paddle.to_tensor(data_index)
             out_z1 = paddle.index_select(x=x, index=index)
             #[[1. 2. 3. 4.]
             # [5. 6. 7. 8.]
@@ -399,17 +470,16 @@ def sort(x, axis=-1, descending=False, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            import paddle.imperative as imperative 
             import numpy as np
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             input_array = np.array([[[5,8,9,5],
                             [0,0,1,7],
                             [6,9,2,4]],
                             [[5,2,4,2],
                             [4,7,7,9],
                             [1,7,0,6]]]).astype(np.float32)
-            x = imperative.to_variable(input_array)
+            x = paddle.to_variable(input_array)
             out1 = paddle.sort(x=x, axis=-1)
             out2 = paddle.sort(x=x, axis=0)
             out3 = paddle.sort(x=x, axis=1)
@@ -631,3 +701,154 @@ def index_sample(x, index):
                 'Index': index},
         outputs={'Out': out})
     return out
+
+
+def masked_select(x, mask, name=None):
+    """
+    This OP Returns a new 1-D tensor which indexes the input tensor according to the ``mask``
+    which is a tensor with data type of bool.
+
+    Args:
+        x (Tensor): The input Tensor, the data type can be int32, int64, float32, float64. 
+        mask (Tensor): The Tensor containing the binary mask to index with, it's data type is bool.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns: A 1-D Tensor which is the same data type  as ``x``.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of  float32, float64, int32 and int64.
+        TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+            data = np.array([[1.0, 2.0, 3.0, 4.0],
+                                [5.0, 6.0, 7.0, 8.0],
+                                [9.0, 10.0, 11.0, 12.0]]).astype('float32')
+            
+            mask_data = np.array([[True, False, False, False],
+                            [True, True, False, False],
+                            [True, False, False, False]]).astype('bool')
+            x = paddle.to_tensor(data)
+            mask = paddle.to_tensor(mask_data)
+            out = paddle.masked_select(x, mask)
+            #[1.0 5.0 6.0 9.0]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.masked_select(x, mask)
+
+    helper = LayerHelper("masked_select", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
+                             'paddle.tensor.search.mask_select')
+    check_variable_and_dtype(mask, 'mask', ['bool'],
+                             'paddle.tensor.search.masked_select')
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='masked_select', inputs={'X': x,
+                                      'Mask': mask}, outputs={'Y': out})
+    return out
+
+
+def topk(x, k, axis=None, largest=True, sorted=True, name=None):
+    """
+    This OP is used to find values and indices of the k largest or smallest at the optional axis.
+    If the input is a 1-D Tensor, finds the k largest or smallest values and indices.
+    If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
+
+    Args:
+        x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64.
+        k(int, Tensor): The number of top elements to look for along the axis.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is -1.
+        largest(bool, optional) : largest is a flag, if set to true,
+            algorithm will sort by descending order, otherwise sort by
+            ascending order. Default is True.
+        sorted(bool, optional): controls whether to return the elements in sorted order, default value is True. In gpu device, it always return the sorted value. 
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64.
+
+    Examples:
+
+        .. code-block:: python
+
+           import numpy as np
+           import paddle
+
+           paddle.disable_static()
+
+           data_1 = np.array([1, 4, 5, 7])
+           tensor_1 = paddle.to_tensor(data_1)
+           value_1, indices_1 = paddle.topk(tensor_1, k=1)
+           print(value_1.numpy())
+           # [7]
+           print(indices_1.numpy())
+           # [3] 
+           data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
+           tensor_2 = paddle.to_tensor(data_2)
+           value_2, indices_2 = paddle.topk(tensor_2, k=1)
+           print(value_2.numpy())
+           # [[7]
+           #  [6]]
+           print(indices_2.numpy())
+           # [[3]
+           #  [1]]
+           value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
+           print(value_3.numpy())
+           # [[7]
+           #  [6]]
+           print(indices_3.numpy())
+           # [[3]
+           #  [1]]
+           value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
+           print(value_4.numpy())
+           # [[2 6 5 7]]
+           print(indices_4.numpy())
+           # [[1 1 0 0]]
+
+    """
+    if in_dygraph_mode():
+        k = k.numpy().item(0) if isinstance(k, Variable) else k
+        if axis is None:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'largest', largest,
+                                             'sorted', sorted)
+        else:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'axis', axis, 'largest',
+                                             largest, 'sorted', sorted)
+        return out, indices
+
+    helper = LayerHelper("top_k_v2", **locals())
+    inputs = {"X": [x]}
+    attrs = {}
+    if isinstance(k, Variable):
+        inputs['K'] = [k]
+    else:
+        attrs = {'k': k}
+    attrs['largest'] = largest
+    attrs['sorted'] = sorted
+    if axis is not None:
+        attrs['axis'] = axis
+
+    values = helper.create_variable_for_type_inference(dtype=x.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
+
+    helper.append_op(
+        type="top_k_v2",
+        inputs=inputs,
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs=attrs)
+    indices.stop_gradient = True
+    return values, indices
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 9b3bb081d9776a7ef88245f76d564c7b107ca669..91676a6316b81a1998b9b48fb9ea7fcba6d67c25 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -13,152 +13,253 @@
 # limitations under the License.
 
 # TODO: define statistical functions of a tensor  
-from ..fluid.layers import mean  #DEFINE_ALIAS
 from ..fluid.layers import reduce_mean  #DEFINE_ALIAS
 
-__all__ = ['mean', 'reduce_mean', 'std', 'var']
+__all__ = ['mean', 'reduce_mean', 'std', 'var', 'numel']
 
 import numpy as np
+from ..fluid.framework import Variable
 from ..fluid.layer_helper import LayerHelper
-from ..fluid.framework import in_dygraph_mode
+from ..fluid.framework import core, in_dygraph_mode
 from ..fluid import layers
 from .search import where
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+import paddle
 
 
-def var(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
+def mean(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.var
-	:alias: paddle.var,paddle.tensor.var,paddle.tensor.stat.var
-
-    Computes the variance of the input Variable's elements along the specified 
-    axis.
+    Computes the mean of the input tensor's elements along ``axis``.
 
     Args:
-        input (Variable): The input Variable to be computed variance, with data 
-            type float32 and float64 supported.
-        axis (list|int, optional): The axis along which the variance is computed. 
-            If `None`, compute the variance over all elements of :attr:`input`
-            and return a Variable with a single element, otherwise it must be in 
-            the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, 
-            the axis to compute is :math:`rank(input) + axis[i]`.
-        keepdim (bool, optional): Whether to reserve the reduced dimensions in 
-            the output Variable. The dimensions in :attr:`axis` will be squeezed 
-            and the result Variable will have :attr:`len(axis)` fewer dimensions 
-            than the :attr:`input` unless :attr:`keepdim` is true, default False.
-        unbiased (bool, optional): Whether to compute variance via the unbiased 
-            estimator, in which the divisor used in the computation is 
-            :math:`N - 1`, where :math:`N` represents the number of elements 
-            along :attr:`axis`, otherwise the divisor is :math:`N`. Default True.
-        out (Variable, optional): Alternate output Variable to store the result
-            variance. Default None.
-        name (str, optional): The name for this layer. Normally there is no 
-            need for user to set this property.  For more information, please 
-            refer to :ref:`api_guide_Name`. Default None.
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform mean
+            calculations. ``axis`` should be int, list(int) or tuple(int). If
+            ``axis`` is a list/tuple of dimension(s), mean is calculated along
+            all element(s) of ``axis`` . ``axis`` or element(s) of ``axis``
+            should be in range [-D, D), where D is the dimensions of ``x`` . If
+            ``axis`` or element(s) of ``axis`` is less than 0, it works the
+            same way as :math:`axis + D` . If ``axis`` is None, mean is
+            calculated over all elements of ``x``. Default is None.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: The result variance with the same dtype as :attr:`input`. 
-            If :attr:`out = None`, returns a new Variable containing the 
-            variance, otherwise returns a reference to the output Variable.
+        Tensor, results of average along ``axis`` of ``x``, with the same data
+        type as ``x``.
 
     Examples:
         .. code-block:: python
 
+            import paddle
             import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[1, 2, 3, 4],
+                           [5, 6, 7, 8],
+                           [9, 10, 11, 12]],
+                          [[13, 14, 15, 16],
+                           [17, 18, 19, 20],
+                           [21, 22, 23, 24]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = paddle.mean(x)
+            # [12.5]
+            out2 = paddle.mean(x, axis=-1)
+            # [[ 2.5  6.5 10.5]
+            #  [14.5 18.5 22.5]]
+            out3 = paddle.mean(x, axis=-1, keepdim=True)
+            # [[[ 2.5]
+            #   [ 6.5]
+            #   [10.5]]
+            #  [[14.5]
+            #   [18.5]
+            #   [22.5]]]
+            out4 = paddle.mean(x, axis=[0, 2])
+            # [ 8.5 12.5 16.5]
+    """
+
+    if isinstance(axis, int):
+        axis = [axis]
+    reduce_all = True if axis is None \
+        or len(axis)==0 \
+        or len(axis) == len(x.shape) else False
+    if axis is None or len(axis) == 0:
+        axis = [0]
+
+    if in_dygraph_mode():
+        return core.ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim,
+                                    'reduce_all', reduce_all)
+
+    check_variable_and_dtype(x, 'x/input', ['float32', 'float64'],
+                             'mean/reduce_mean')
+    check_type(axis, 'axis/dim', (int, list, tuple), 'mean/reduce_mean')
+    if isinstance(axis, (list, tuple)):
+        for item in axis:
+            check_type(item, 'elements of axis/dim', (int), 'mean/reduce_mean')
+
+    helper = LayerHelper('mean', **locals())
+    attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='reduce_mean', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+def var(x, axis=None, unbiased=True, keepdim=False, name=None):
+    """
+    Computes the variance of ``x`` along ``axis`` .
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            variance calculations. ``axis`` should be int, list(int) or
+            tuple(int). If ``axis`` is a list/tuple of dimension(s), variance
+            is calculated along all element(s) of ``axis`` . ``axis`` or
+            element(s) of ``axis`` should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is less
+            than 0, it works the same way as :math:`axis + D` . If ``axis`` is
+            None, variance is calculated over all elements of ``x``. Default
+            is None.
+        unbiased (bool, optional): Whether to use the unbiased estimation. If
+            ``unbiased`` is True, the divisor used in the computation is
+            :math:`N - 1`, where :math:`N` represents the number of elements
+            along ``axis`` , otherwise the divisor is :math:`N`. Default is True.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, results of variance along ``axis`` of ``x``, with the same data
+        type as ``x``.
+
+    Examples:
+        .. code-block:: python
+
             import paddle
-            import paddle.fluid.dygraph as dg
-
-            a = np.array([[1.0, 2.0], [3.0, 4.0]]).astype("float32")
-            with dg.guard():
-                data = dg.to_variable(a)
-                variance = paddle.var(data, axis=[1])
-                print(variance.numpy())   
-                # [0.5 0.5]
+            import numpy as np
+            
+            paddle.disable_static()
+
+            x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
+            x = paddle.to_tensor(x)
+            out1 = paddle.var(x)
+            # [2.66666667]
+            out2 = paddle.var(x, axis=1)
+            # [1.         4.33333333]
     """
-    dtype = convert_dtype(input.dtype)
-    if dtype not in ["float32", "float64"]:
-        raise ValueError("Layer tensor.var() only supports floating-point "
-                         "dtypes, but received {}.".format(dtype))
-    rank = len(input.shape)
-    axes = axis if axis != None and axis != [] else range(rank)
-    axes = [e if e >= 0 else e + rank for e in axes]
-    inp_shape = input.shape if in_dygraph_mode() else layers.shape(input)
-    mean = layers.reduce_mean(input, dim=axis, keep_dim=True, name=name)
-    tmp = layers.reduce_mean(
-        (input - mean)**2, dim=axis, keep_dim=keepdim, name=name)
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var')
+
+    u = mean(x, axis, True, name)
+    out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name)
 
+    n = paddle.cast(paddle.numel(x), x.dtype) \
+        / paddle.cast(paddle.numel(out), x.dtype)
     if unbiased:
-        n = 1
-        for i in axes:
-            n *= inp_shape[i]
-        if not in_dygraph_mode():
-            n = layers.cast(n, dtype)
-            zero_const = layers.fill_constant(shape=[1], dtype=dtype, value=0.0)
-            factor = where(n > 1.0, n / (n - 1.0), zero_const)
-        else:
-            factor = n / (n - 1.0) if n > 1.0 else 0.0
-        tmp *= factor
-    if out:
-        layers.assign(input=tmp, output=out)
-        return out
-    else:
-        return tmp
-
-
-def std(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
+        one_const = paddle.ones([1], x.dtype)
+        n = where(n > one_const, n - 1., one_const)
+    out /= n
+    return out
+
+
+def std(x, axis=None, unbiased=True, keepdim=False, name=None):
+    """
+    Computes the standard-deviation of ``x`` along ``axis`` .
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            standard-deviation calculations. ``axis`` should be int, list(int)
+            or tuple(int). If ``axis`` is a list/tuple of dimension(s),
+            standard-deviation is calculated along all element(s) of ``axis`` .
+            ``axis`` or element(s) of ``axis`` should be in range [-D, D),
+            where D is the dimensions of ``x`` . If ``axis`` or element(s) of
+            ``axis`` is less than 0, it works the same way as :math:`axis + D` .
+            If ``axis`` is None, standard-deviation is calculated over all
+            elements of ``x``. Default is None.
+        unbiased (bool, optional): Whether to use the unbiased estimation. If
+            ``unbiased`` is True, the standard-deviation is calculated via the
+            unbiased estimator. If ``unbiased`` is True,  the divisor used in
+            the computation is :math:`N - 1`, where :math:`N` represents the
+            number of elements along ``axis`` , otherwise the divisor is
+            :math:`N`. Default is True.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, results of standard-deviation along ``axis`` of ``x``, with the
+        same data type as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+
+            x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
+            x = paddle.to_tensor(x)
+            out1 = paddle.std(x)
+            # [1.63299316]
+            out2 = paddle.std(x, axis=1)
+            # [1.       2.081666]
     """
-	:alias_main: paddle.std
-	:alias: paddle.std,paddle.tensor.std,paddle.tensor.stat.std
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'std')
 
-    Computes the standard-deviation  of the input Variable's elements along the specified 
-    axis.
+    out = var(**locals())
+    return paddle.sqrt(out)
+
+
+def numel(x, name=None):
+    """
+    Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static mode
+    or a scalar value in imperative mode
 
     Args:
-        input (Variable): The input Variable to be computed standard-deviation, with data 
-            type float32 and float64 supported.
-        axis (list|int, optional): The axis along which the standard-deviation is computed. 
-            If `None`, compute the standard-deviation over all elements of :attr:`input`
-            and return a Variable with a single element, otherwise it must be in 
-            the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, 
-            the axis to compute is :math:`rank(input) + axis[i]`.
-        keepdim (bool, optional): Whether to reserve the reduced dimensions in 
-            the output Variable. The dimensions in :attr:`axis` will be squeezed 
-            and the result Variable will have :attr:`len(axis)` fewer dimensions 
-            than the :attr:`input` unless :attr:`keepdim` is true, default False.
-        unbiased (bool, optional): Whether to compute standard-deviation via the unbiased 
-            estimator, in which the divisor used in the computation is 
-            :math:`N - 1`, where :math:`N` represents the number of elements 
-            along :attr:`axis`, otherwise the divisor is :math:`N`. Default True.
-        out (Variable, optional): Alternate output Variable to store the result
-            standard-deviation . Default None.
-        name (str, optional): The name for this layer. Normally there is no 
-            need for user to set this property.  For more information, please 
-            refer to :ref:`api_guide_Name`. Default None.
+        x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
 
     Returns:
-        Variable: The result standard-deviation  with the same dtype as :attr:`input`. 
-            If :attr:`out = None`, returns a new Variable containing the 
-            standard-deviation , otherwise returns a reference to the output Variable.
+        Tensor: The number of elements for the input Tensor.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64.
+
+
     Examples:
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.std(x)  # [0.28252685] 
-            paddle.std(x, axis=[0])  # [0.0707107, 0.07071075, 0.07071064, 0.1414217]
-            paddle.std(x, axis=[-1])  # [0.30956957, 0.29439208] 
+            
+            paddle.disable_static()
+            x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32')
+            numel = paddle.numel(x) # 140
+
+
     """
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'std')
-
-    tmp = var(input, axis=axis, keepdim=keepdim, unbiased=unbiased, name=name)
-    tmp = layers.sqrt(tmp)
-    if out is not None:
-        layers.assign(input=tmp, output=out)
-        return out
-    else:
-        return tmp
+    if in_dygraph_mode():
+        return core.ops.size(x)
+
+    if not isinstance(x, Variable):
+        raise TypeError("x must be a Tensor in numel")
+    helper = LayerHelper('numel', **locals())
+    out = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.INT64)
+    helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4
--- /dev/null
+++ b/python/paddle/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..2272a81b3f602ec46972c9d4620ded9680e2ff5f
--- /dev/null
+++ b/python/paddle/tests/test_metrics.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+
+from paddle.incubate.hapi.utils import to_list
+
+
+def accuracy(pred, label, topk=(1, )):
+    maxk = max(topk)
+    pred = np.argsort(pred)[:, ::-1][:, :maxk]
+    correct = (pred == np.repeat(label, maxk, 1))
+
+    batch_size = label.shape[0]
+    res = []
+    for k in topk:
+        correct_k = correct[:, :k].sum()
+        res.append(float(correct_k) / batch_size)
+    return res
+
+
+def convert_to_one_hot(y, C):
+    oh = np.random.choice(np.arange(C), C, replace=False).astype('float32') / C
+    oh = np.tile(oh[np.newaxis, :], (y.shape[0], 1))
+    for i in range(y.shape[0]):
+        oh[i, int(y[i])] = 1.
+    return oh
+
+
+class TestAccuracy(unittest.TestCase):
+    def test_acc(self):
+        paddle.disable_static()
+
+        x = paddle.to_tensor(
+            np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2],
+                      [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+
+        m = paddle.metric.Accuracy(name='my_acc')
+
+        # check name
+        self.assertEqual(m.name(), ['my_acc'])
+
+        correct = m.compute(x, y)
+        # check results
+        self.assertEqual(m.update(correct), 0.75)
+        self.assertEqual(m.accumulate(), 0.75)
+
+        x = paddle.to_tensor(
+            np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.3, 0.4, 0.2],
+                      [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+        correct = m.compute(x, y)
+        # check results
+        self.assertEqual(m.update(correct), 0.5)
+        self.assertEqual(m.accumulate(), 0.625)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.total[0], 0.0)
+        self.assertEqual(m.count[0], 0.0)
+        paddle.enable_static()
+
+
+class TestAccuracyDynamic(unittest.TestCase):
+    def setUp(self):
+        self.topk = (1, )
+        self.class_num = 5
+        self.sample_num = 1000
+        self.name = None
+
+    def random_pred_label(self):
+        label = np.random.randint(0, self.class_num,
+                                  (self.sample_num, 1)).astype('int64')
+        pred = np.random.randint(0, self.class_num,
+                                 (self.sample_num, 1)).astype('int32')
+        pred_one_hot = convert_to_one_hot(pred, self.class_num)
+        pred_one_hot = pred_one_hot.astype('float32')
+
+        return label, pred_one_hot
+
+    def test_main(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            acc = paddle.metric.Accuracy(topk=self.topk, name=self.name)
+            for _ in range(10):
+                label, pred = self.random_pred_label()
+                label_var = paddle.to_tensor(label)
+                pred_var = paddle.to_tensor(pred)
+                state = to_list(acc.compute(pred_var, label_var))
+                acc.update(* [s.numpy() for s in state])
+                res_m = acc.accumulate()
+                res_f = accuracy(pred, label, self.topk)
+                assert np.all(np.isclose(np.array(res_m, dtype='float64'),
+                              np.array(res_f, dtype='float64'), rtol=1e-3)), \
+                    "Accuracy precision error: {} != {}".format(res_m, res_f)
+                acc.reset()
+                assert np.sum(acc.total) == 0
+                assert np.sum(acc.count) == 0
+
+
+class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
+    def setUp(self):
+        self.topk = (1, 5)
+        self.class_num = 10
+        self.sample_num = 1000
+        self.name = "accuracy"
+
+
+class TestAccuracyStatic(TestAccuracyDynamic):
+    def test_main(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        main_prog.random_seed = 1024
+        startup_prog.random_seed = 1024
+        with fluid.program_guard(main_prog, startup_prog):
+            pred = fluid.data(
+                name='pred', shape=[None, self.class_num], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            acc = paddle.metric.Accuracy(topk=self.topk, name=self.name)
+            state = acc.compute(pred, label)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        compiled_main_prog = fluid.CompiledProgram(main_prog)
+
+        for _ in range(10):
+            label, pred = self.random_pred_label()
+            state_ret = exe.run(compiled_main_prog,
+                                feed={'pred': pred,
+                                      'label': label},
+                                fetch_list=[s.name for s in to_list(state)],
+                                return_numpy=True)
+            acc.update(*state_ret)
+            res_m = acc.accumulate()
+            res_f = accuracy(pred, label, self.topk)
+            assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \
+                    "Accuracy precision error: {} != {}".format(res_m, res_f)
+            acc.reset()
+            assert np.sum(acc.total) == 0
+            assert np.sum(acc.count) == 0
+
+
+class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
+    def setUp(self):
+        self.topk = (1, 5)
+        self.class_num = 10
+        self.sample_num = 100
+        self.name = "accuracy"
+
+
+class TestPrecision(unittest.TestCase):
+    def test_1d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7, 0.2]))
+        y = paddle.to_tensor(np.array([1, 0, 1, 1, 1]))
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 4. / 6.)
+
+        paddle.enable_static()
+
+    def test_2d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7]).reshape(-1, 1)
+        y = np.array([1, 0, 1, 1]).reshape(-1, 1)
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = np.array([0.1, 0.5, 0.6, 0.7, 0.2]).reshape(-1, 1)
+        y = np.array([1, 0, 1, 1, 1]).reshape(-1, 1)
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 4. / 6.)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.tp, 0.0)
+        self.assertEqual(m.fp, 0.0)
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+
+class TestRecall(unittest.TestCase):
+    def test_1d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Recall()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7]))
+        y = paddle.to_tensor(np.array([1, 0, 0, 1]))
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 3. / 5.)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.tp, 0.0)
+        self.assertEqual(m.fn, 0.0)
+        self.assertEqual(m.accumulate(), 0.0)
+        paddle.enable_static()
+
+
+class TestAuc(unittest.TestCase):
+    def test_auc_numpy(self):
+        paddle.disable_static()
+        x = np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70],
+                      [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]])
+        y = np.array([[0], [1], [1], [0], [1], [0], [0], [1]])
+        m = paddle.metric.Auc()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 0.8125)
+
+        m.reset()
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+    def test_auc_tensor(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(
+            np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70],
+                      [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]]))
+        y = paddle.to_tensor(np.array([[0], [1], [1], [0], [1], [0], [0], [1]]))
+        m = paddle.metric.Auc()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 0.8125)
+
+        m.reset()
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index 27621c2d872a6d10ec3259312abe318fef5b334b..08fd7e33479b331454f63f05f6240dd221591ee9 100644
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -19,6 +19,14 @@ import warnings
 import functools
 import paddle
 
+# NOTE(zhiqiu): Since python 3.2, DeprecationWarning is ignored by default,
+# and since python 3.7, it is once again shown by default when triggered directly by code in __main__.
+# See details: https://docs.python.org/3/library/warnings.html#default-warning-filter
+# The following line set DeprecationWarning to show once, which is expected to work in python 3.2 -> 3.6
+# However, doing this could introduce one samll side effect, i.e., the DeprecationWarning which is not issued by @deprecated.
+# The side effect is acceptable, and we will find better way to do this if we could.
+warnings.simplefilter('default', DeprecationWarning)
+
 
 def deprecated(update_to="", since="", reason=""):
     """Decorate a function to signify its deprecation.
@@ -36,6 +44,8 @@ def deprecated(update_to="", since="", reason=""):
     """
 
     def decorator(func):
+        # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future.
+        return func
         """construct warning message, and return a decorated function or class."""
         assert isinstance(update_to, str), 'type of "update_to" must be str.'
         assert isinstance(since, str), 'type of "since" must be str.'
@@ -54,7 +64,7 @@ def deprecated(update_to="", since="", reason=""):
                 "paddle."
             ), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
                 update_to)
-            msg += ' Use "{}" instead.'.format(_update_to)
+            msg += ' Please use "{}" instead.'.format(_update_to)
         if len(_reason) > 0:
             msg += "\n reason: {}".format(_reason)
 
@@ -70,11 +80,8 @@ def deprecated(update_to="", since="", reason=""):
             v_since = [int(i) for i in _since.split(".")]
             v_since += [0] * (4 - len(v_since))
             if paddle.__version__ == "0.0.0" or _since == "" or v_current >= v_since:
-                warnings.simplefilter('always',
-                                      DeprecationWarning)  # turn off filter
                 warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
-                warnings.simplefilter('default',
-                                      DeprecationWarning)  # reset filter
+
             return func(*args, **kwargs)
 
         return wrapper
diff --git a/python/requirements.txt b/python/requirements.txt
index 13a1c9a9d638daf6a78f52d9d66fcf3f15b74c37..5e81ec680897024e7c32d193bef1716e9b25b4a4 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,12 +1,13 @@
+opencv-python<=4.2.0.32
 requests>=2.20.0
-numpy>=1.12, <=1.16.4 ; python_version<"3.5"
-numpy>=1.12 ; python_version>="3.5"
+numpy>=1.13, <=1.16.4 ; python_version<"3.5"
+numpy>=1.13 ; python_version>="3.5"
 protobuf>=3.1.0
-gast>=0.3.3
+gast==0.3.3
 matplotlib<=2.2.4 ; python_version<"3.6"
 scipy>=0.19.0, <=1.2.1 ; python_version<"3.5"
 nltk>=3.2.2, <=3.4 ; python_version<"3.5"
-matplotlib ; python_version>="3.6"
+matplotlib<=3.2.1 ; python_version>="3.6"
 scipy<=1.3.1 ; python_version=="3.5"
 scipy ; python_version>"3.5"
 nltk ; python_version>="3.5"
@@ -22,3 +23,4 @@ objgraph
 astor
 pathlib
 netifaces
+psutil
diff --git a/python/setup.py.in b/python/setup.py.in
index b2e2811dea3e6bec685c0f3c1499f1ee8d0b0d4d..5b206296bd641bf909115d1c580518afe85a37b6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -145,18 +145,20 @@ packages=['paddle',
           'paddle.incubate',
           'paddle.incubate.complex',
           'paddle.incubate.complex.tensor',
-          'paddle.fleet',
-          'paddle.fleet.base',
-          'paddle.fleet.meta_optimizers',
-          'paddle.fleet.runtime',
-          'paddle.fleet.dataset',
-          'paddle.fleet.metrics',
-          'paddle.fleet.proto',
-          'paddle.fleet.utils',
+          'paddle.distributed.fleet',
+          'paddle.distributed.fleet.base',
+          'paddle.distributed.fleet.meta_optimizers',
+          'paddle.distributed.fleet.runtime',
+          'paddle.distributed.fleet.dataset',
+          'paddle.distributed.fleet.metrics',
+          'paddle.distributed.fleet.proto',
+          'paddle.distributed.fleet.utils',
           'paddle.framework',
+          'paddle.jit',
           'paddle.fluid',
           'paddle.fluid.dygraph',
           'paddle.fluid.dygraph.dygraph_to_static',
+          'paddle.fluid.dygraph.amp',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.distributed',
@@ -199,9 +201,10 @@ packages=['paddle',
           'paddle.nn.functional',
           'paddle.nn.layer',
           'paddle.nn.initializer',
+          'paddle.nn.utils',
           'paddle.metric',
-          'paddle.imperative',
-          'paddle.imperative.jit',
+          'paddle.static',
+          'paddle.static.nn',
           'paddle.tensor',
           ]
 
@@ -300,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON':
     else:
         package_data['paddle.libs']+=['mkldnn.dll']
 
+if '${WITH_XPU}' == 'ON':
+    # only change rpath in Release mode,
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command)
+    shutil.copy('${XPU_API_LIB}', libs_path)
+    shutil.copy('${XPU_RT_LIB}', libs_path)
+    shutil.copy('${XPU_SIM_LIB}', libs_path)
+    package_data['paddle.libs']+=['${XPU_API_LIB_NAME}',
+                                  '${XPU_RT_LIB_NAME}',
+                                  '${XPU_SIM_LIB_NAME}']
+
 # copy libfuild_framework.so to libs
 if os.name != 'nt' and sys.platform != 'darwin':
     paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
@@ -480,7 +500,7 @@ with redirect_stdout():
         },
         entry_points={
             'console_scripts': [
-                'fleetrun = paddle.fleet.launch:launch'
+                'fleetrun = paddle.distributed.fleet.launch:launch'
             ]
         }
     )
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index b1f3f84b36ee295529661cf74b13e71d620254c9..f7ee09e11ea5e3a3b5ba4ce6b3be8af4abe7cae4 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -5,48 +5,10 @@ if [ -z ${BRANCH} ]; then
 fi
 
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
-API_FILES=("CMakeLists.txt"
-           "paddle/fluid/framework/operator.h"
-           "paddle/fluid/framework/tensor.h"
-           "paddle/fluid/framework/details/op_registry.h"
-           "paddle/fluid/framework/grad_op_desc_maker.h"
-           "paddle/fluid/framework/lod_tensor.h"
-           "paddle/fluid/framework/selected_rows.h"
-           "paddle/fluid/framework/op_desc.h"
-           "paddle/fluid/framework/block_desc.h"
-           "paddle/fluid/framework/var_desc.h"
-           "paddle/fluid/framework/scope.h"
-           "paddle/fluid/framework/ir/node.h"
-           "paddle/fluid/framework/ir/graph.h"
-           "paddle/fluid/framework/framework.proto"
-	   "python/paddle/fleet/__init__.py"
-           "python/requirements.txt"
-           "python/paddle/fluid/__init__.py"
-           "python/paddle/fluid/compiler.py"
-           "python/paddle/fluid/parallel_executor.py"
-           "python/paddle/fluid/framework.py"
-           "python/paddle/fluid/backward.py"
-           "paddle/fluid/operators/distributed/send_recv.proto.in"
-           "paddle/fluid/framework/unused_var_check.cc"
-           "paddle/fluid/pybind/op_function_generator.cc"
-           "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py"
-           "tools/wlist.json"
-           )
-
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
-git_files=`git diff --numstat upstream/$BRANCH| wc -l`
-git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
 failed_num=0
 echo_list=()
 
-
 function check_approval(){
     person_num=`echo $@|awk '{for (i=2;i<=NF;i++)print $i}'`
     APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py $1 $person_num`
@@ -55,18 +17,12 @@ function check_approval(){
     fi
 }
 
-
 function add_failed(){
     failed_num=`expr $failed_num + 1`
     echo_list="${echo_list[@]}$1"
 }
 
 
-if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
-    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n"
-    check_approval 1 38231817
-fi    
-
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` 
 if [ "$api_spec_diff" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01 or lanxianghit) and one TPM (saxon-zh or jzhang533 or swtkiwi or Heeenrrry or TCChenlong) approval for the api change for the management reason of API interface.\n"
@@ -101,213 +57,6 @@ if [ "$op_desc_diff" != "" ]; then
     check_approval 1 33742067 7913861 9301846 47554610 43953930
 fi
 
-for API_FILE in ${API_FILES[*]}; do
-  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
-  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
-      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-      # You can use http://caius.github.io/github_id/ to find Github user id.
-      # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 31623103, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, JepsonWong 16509038, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832.
-      if [ "${API_FILE}" == "CMakeLists.txt" ];then
-          echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
-          check_approval 1 6836917 46782768
-      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
-          check_approval 1 6836917 47554610
-      elif [ "${API_FILE}" == "python/requirements.txt" ];then
-          echo_line="You must have one RD (kolinwei (Recommend), JepsonWong or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
-          check_approval 1 22165420 16509038 6836917
-      elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then
-          echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
-          check_approval 1 10721757 5442383
-      elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
-          echo_line="You must have one RD (zhiqiu (Recommend) , sneaxiy or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
-          check_approval 1 6888866 32832641 6836917
-      elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then
-          echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n"
-          check_approval 1 6888866 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (hong19860320 (Recommend), luotao1, phlrain) approval for the changes of check_shape_white_list.py, which manages the white list of operators with limited input size. Inputs size of all cases in the op test must be greater than or equal to 100. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/OP-Test-Input-Shape-Requirements. \n"
-          check_approval 1 9973393 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 (Recommend) or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py, which manages the white list of upgrading the precision of op test to float64. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
-          check_approval 1 52520497 26615455 6836917
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py" ];then
-           echo_line="You must have one RD (DannyIsFunny (Recommend), luotao1, phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py, which manages the white list of compile&runtime lod-level check. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Compile_vs_Runtime-Check-Specification. \n"
-          check_approval 1 45189361 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py" ];then
-          echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
-          check_approval 1 12407750 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
-          echo_line="You must have one RD (JepsonWong (Recommend), luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
-          check_approval 1 16509038 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
-          check_approval 1 52520497 26615455 6836917
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then
-          echo_line="You must have one RD (songyouwei, luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
-          check_approval 1 2573291 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
-        echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
-        check_approval 1 39303645 6836917 43953930
-      elif [ "${API_FILE}" == "tools/wlist.json" ];then
-        echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
-        check_approval 1 29231
-      elif [ "${API_FILE}" == "python/paddle/fleet/__init__.py" ]; then
-	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
-	check_approval 1 35550832 38231817
-      else
-          echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
-          check_approval 1 3048612 46782768 12538138 6836917 32832641
-      fi
-  fi
-done
-
-HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
-if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for the usage (either add or delete) of const_cast.\n"
-    check_approval 1 3048612 46782768 12538138 6836917 32832641
-fi
-
-HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -o -m 1 "boost::get" || true`
-if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="boost::get is not recommended, because it may throw an bad_get exception without any stack information, so please use BOOST_GET(_**)(dtype, value) series macros here. If these macros cannot meet your needs, please use try-catch to handle boost::get and request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
-if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
-if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
-    check_approval 1 47554610
-fi
-
-HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
-if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
-    check_approval 1 22165420 6836917 46661762
-  fi
-
-HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true`
-if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
-    check_approval 1 328693 6836917
-  fi
-
-ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
-if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
-VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
-INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
-if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
-ALL_OPTEST_BAN_DYGRAPH_MESSAGE=""
-for CHANGE_FILE in ${ALL_CHANGE_FILES}; do
-    ALL_OPTEST_BAN_DYGRAPH=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CHANGE_FILE} | grep "+" | grep "check_dygraph=" || true`
-    if [ "${ALL_OPTEST_BAN_DYGRAPH}" != "" ]; then
-        ALL_OPTEST_BAN_DYGRAPH_MESSAGE="${ALL_OPTEST_BAN_DYGRAPH_MESSAGE} ${CHANGE_FILE} : \n${ALL_OPTEST_BAN_DYGRAPH} \n"
-    fi
-done
-if [ "${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Developers are not allowed to set the check_dygraph field directly, which is set to True by default. If you need to change the check_dygraph field, you must have one RD (phlrain (Recommend) or lanxianghit) review and approve. \nThe code that do not meet the specification are as follows:\n${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}\n"
-    check_approval 1 43953930 47554610
-fi
-
-NEW_OP_ADDED=`git diff --name-only --diff-filter=A upstream/$BRANCH |grep -oE ".+_op..*" || true`
-if [ "${NEW_OP_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    GET_KERNEL_TYPE_FUNC_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -czoE "GetExpectedKernelType[(][^(){}]+[)][^{]+[{][^}]+[}]" || true`
-    INDICATE_VAR_DTYPE_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -co "IndicateVarDataType" || true`
-    if [ ${GET_KERNEL_TYPE_FUNC_CNT} -gt ${INDICATE_VAR_DTYPE_CNT} ]; then
-        echo_line="If you override GetExpectedKernelType method of OperatorWithKernel, please use OperatorWithKernel::IndicateVarDataType() method to get specific input variable's dtype, which checked whether the input variable is initialized (The details in https://github.com/PaddlePaddle/FluidDoc/pull/1527). If you don't use this method to check, you must have one RD (chenwhql (Recommend) , luotao1 or lanxianghit) approval for the usage of other methods.\n"
-        check_approval 1 6836917 47554610 22561442
-    fi
-fi
-
-HAS_OPERATORBASE_FLAG=`git diff -U0 --diff-filter=A upstream/$BRANCH | grep -E "public[[:space:]]+.*OperatorBase" || true`
-if [ "${HAS_OPERATORBASE_FLAG}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="In order to support dynamic graph, all ops are not recommended to inherit OperatorBase. Please use OperatorWithKernel instead.\nYou must have one RD (phlrain (Recommend), luotao1, lanxianghit or XiaoguangHu01) approval for the inherit of OperatorBase.\nYou inherit the OperatorBase class. The corresponding lines are as follows:\n${HAS_OPERATORBASE_FLAG}"
-    check_approval 1 43953930 6836917 47554610 46782768
-fi
-
-HAS_INPLACE_TESTS=`git diff -U0 upstream/$BRANCH |grep "+" |grep -E "inplace_atol[[:space:]]*=.*" || true`
-if [ "${HAS_INPLACE_TESTS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="The calculation results of setting inplace enabled and disabled must be equal, that is, it's not recommended to set inplace_atol.\n If you do need to use inplace_atol, you must have one RD (XiaoguangHu01, lanxianghit, phlrain, luotao1) approval for the usage of inplace_atol.\nThe corresponding lines are as follows:\n${HAS_INPLACE_TESTS}\n"
-    check_approval 1 46782768 47554610 43953930 6836917
-fi
-
-OP_FILE_CHANGED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE ".+_op..*" || true`
-if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for OP_FILE in ${OP_FILE_CHANGED};
-    do
-        CHECK_OBJECT_FLAGS=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${OP_FILE} |grep "+" |grep -E "ShareDataWith[(]|ShareBufferWith[(]" || true`
-        if [ "${CHECK_OBJECT_FLAGS}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${OP_FILE}${CHECK_OBJECT_FLAGS}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), sneaxiy or luotao1 or lanxianghit) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}"
-        check_approval 1 6836917 32832641 47554610 7913861
-    fi
-fi
-
-NEW_OP_TEST_ADDED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE "test_.*.\.py" || true`
-if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    CHECK_OUTPUT=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep "self\.check_output(a*t*o*l*=*[0-9]"|grep "+" || true`
-    CHECK_OUTPUT_WITH_PLACE=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 "self\.check_output_with_place" |grep ", [atol*,0-9]"|grep "+" || true`
-    CHECK_GRAD=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A5 -E "self\.check_grad|self\.check_grad_with_place"|grep "max_relative_error=" |grep "+" || true`
-    CHECK_GRAD_CHECK=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 -E "checker\.double_grad_check"|grep "eps=|atol=|rtol=" |grep "+" || true`
-    CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK
-    if [ "${CHECK_WHOLE}" != "" ] ; then
-        CHECK_OP=${CHECK_WHOLE//+/'\n+'}       
-        echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n"
-        check_approval 1 6836917 47554610 12538138 43953930
-    fi
-fi
-
-UNITTEST_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "test_.*.\.py" || true`
-if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for TEST_FILE in ${UNITTEST_FILE_CHANGED};
-    do
-        HAS_SKIP_CHECK_GRAD_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "@skip_check_grad_ci" || true`
-        if [ "${HAS_SKIP_CHECK_GRAD_CI}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_GRAD_CI}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1 or phlrain) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n"
-        check_approval 1 26615455 6836917 43953930
-    fi
-fi
-
-RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true`
-if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED};
-    do
-        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true`
-	if [[ ${RUNTYPE_ADD} != "" ]];then
-	    RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
-	fi
-    done
-    if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
-        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
-	check_approval 1 32428676 45041955
-    fi
-fi
-
 DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec
 PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec
 ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}` 
@@ -316,16 +65,6 @@ if [ "${ADDED_OP_USE_DEFAULT_GRAD_MAKER}" != "" ]; then
   check_approval 1 32832641 6836917
 fi
 
-# Get the list of PR authors with unresolved unit test issues
-pip install PyGithub
-# For getting PR related data
-wget https://paddle-ci.gz.bcebos.com/blk/block.txt
-HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
-if [ "${HASUTFIXED}" != "" ]; then
-  echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
-  check_approval 1 45041955 22165420
-fi
-
 if [ -n "${echo_list}" ];then
   echo "****************"
   echo -e "${echo_list[@]}"
@@ -336,5 +75,5 @@ fi
 python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec
 python ${PADDLE_ROOT}/tools/check_op_register_type.py ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_PR.spec
 if [ -n "${echo_list}" ]; then
-  exit 1
+  exit 6
 fi
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2c575e4abf1beed039d3293821b8df356d4e9295
--- /dev/null
+++ b/tools/check_file_diff_approvals.sh
@@ -0,0 +1,301 @@
+#!/bin/bash
+
+if [ -z ${BRANCH} ]; then
+    BRANCH="develop"
+fi
+
+PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
+API_FILES=("CMakeLists.txt"
+           "paddle/fluid/framework/operator.h"
+           "paddle/fluid/framework/tensor.h"
+           "paddle/fluid/framework/details/op_registry.h"
+           "paddle/fluid/framework/grad_op_desc_maker.h"
+           "paddle/fluid/framework/lod_tensor.h"
+           "paddle/fluid/framework/selected_rows.h"
+           "paddle/fluid/framework/op_desc.h"
+           "paddle/fluid/framework/block_desc.h"
+           "paddle/fluid/framework/var_desc.h"
+           "paddle/fluid/framework/scope.h"
+           "paddle/fluid/framework/ir/node.h"
+           "paddle/fluid/framework/ir/graph.h"
+           "paddle/fluid/framework/framework.proto"
+	   "python/paddle/distributed/__init"
+	   "python/paddle/distributed/fleet/__init__.py"
+           "python/requirements.txt"
+           "python/paddle/fluid/__init__.py"
+           "python/paddle/fluid/compiler.py"
+           "python/paddle/fluid/parallel_executor.py"
+           "python/paddle/fluid/framework.py"
+           "python/paddle/fluid/backward.py"
+           "paddle/fluid/operators/distributed/send_recv.proto.in"
+           "paddle/fluid/framework/unused_var_check.cc"
+           "paddle/fluid/pybind/op_function_generator.cc"
+           "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py"
+           "tools/wlist.json"
+           )
+
+approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
+git_files=`git diff --numstat upstream/$BRANCH| wc -l`
+git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
+failed_num=0
+echo_list=()
+
+
+function check_approval(){
+    person_num=`echo $@|awk '{for (i=2;i<=NF;i++)print $i}'`
+    APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py $1 $person_num`
+    if [[ "${APPROVALS}" == "FALSE" && "${echo_line}" != "" ]]; then
+        add_failed "${failed_num}. ${echo_line}"
+    fi
+}
+
+
+function add_failed(){
+    failed_num=`expr $failed_num + 1`
+    echo_list="${echo_list[@]}$1"
+}
+
+
+if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
+    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n"
+    check_approval 1 38231817
+fi
+
+for API_FILE in ${API_FILES[*]}; do
+  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
+  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
+      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
+      # You can use http://caius.github.io/github_id/ to find Github user id.
+      # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832.
+      if [ "${API_FILE}" == "CMakeLists.txt" ];then
+          echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
+          check_approval 1 6836917 46782768
+      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
+          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
+          check_approval 1 6836917 47554610
+      elif [ "${API_FILE}" == "python/requirements.txt" ];then
+          echo_line="You must have one RD (kolinwei (Recommend) or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
+          check_approval 1 22165420 6836917
+      elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then
+          echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
+          check_approval 1 10721757 5442383
+      elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
+          echo_line="You must have one RD (zhiqiu (Recommend) or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
+          check_approval 1 6888866 6836917
+      elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then
+          echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n"
+          check_approval 1 6888866 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (hong19860320 (Recommend), luotao1, phlrain) approval for the changes of check_shape_white_list.py, which manages the white list of operators with limited input size. Inputs size of all cases in the op test must be greater than or equal to 100. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/OP-Test-Input-Shape-Requirements. \n"
+          check_approval 1 9973393 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 (Recommend) or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py, which manages the white list of upgrading the precision of op test to float64. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
+          check_approval 1 52520497 26615455 6836917
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py" ];then
+           echo_line="You must have one RD (DannyIsFunny (Recommend), luotao1, phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py, which manages the white list of compile&runtime lod-level check. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Compile_vs_Runtime-Check-Specification. \n"
+          check_approval 1 45189361 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py" ];then
+          echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
+          check_approval 1 12407750 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
+          echo_line="You must have one RD (luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
+          check_approval 1 52520497 26615455 6836917
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then
+          echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
+        echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
+        check_approval 1 39303645 6836917 43953930
+      elif [ "${API_FILE}" == "tools/wlist.json" ];then
+        echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
+        check_approval 1 29231
+      elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then
+	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+	check_approval 1 35550832 38231817
+      elif [ "${API_FILE}" == "python/paddle/distributed/__init__.py" ]; then
+	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+	check_approval 1 35550832 38231817
+      else
+          echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
+          check_approval 1 3048612 46782768 12538138 6836917
+      fi
+  fi
+done
+
+FILTER=`git diff --name-only upstream/develop | grep -v "tools/"`
+HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER |grep -o -m 1 "const_cast" || true`
+if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for the usage (either add or delete) of const_cast.\n"
+    check_approval 1 3048612 46782768 12538138 6836917
+fi
+
+HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true`
+if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="boost::get is not recommended, because it may throw an bad_get exception without any stack information, so please use BOOST_GET(_**)(dtype, value) series macros here. If these macros cannot meet your needs, please use try-catch to handle boost::get and request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
+if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
+if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
+    check_approval 1 47554610
+fi
+
+HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
+if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
+    check_approval 1 22165420 6836917 46661762
+  fi
+
+HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true`
+if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
+    check_approval 1 328693 6836917
+  fi
+
+ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
+if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
+VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
+INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
+if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
+ALL_OPTEST_BAN_DYGRAPH_MESSAGE=""
+for CHANGE_FILE in ${ALL_CHANGE_FILES}; do
+    ALL_OPTEST_BAN_DYGRAPH=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CHANGE_FILE} | grep "+" | grep "check_dygraph=" || true`
+    if [ "${ALL_OPTEST_BAN_DYGRAPH}" != "" ]; then
+        ALL_OPTEST_BAN_DYGRAPH_MESSAGE="${ALL_OPTEST_BAN_DYGRAPH_MESSAGE} ${CHANGE_FILE} : \n${ALL_OPTEST_BAN_DYGRAPH} \n"
+    fi
+done
+if [ "${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="Developers are not allowed to set the check_dygraph field directly, which is set to True by default. If you need to change the check_dygraph field, you must have one RD (phlrain (Recommend) or lanxianghit) review and approve. \nThe code that do not meet the specification are as follows:\n${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}\n"
+    check_approval 1 43953930 47554610
+fi
+
+NEW_OP_ADDED=`git diff --name-only --diff-filter=A upstream/$BRANCH |grep -oE ".+_op..*" || true`
+if [ "${NEW_OP_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    GET_KERNEL_TYPE_FUNC_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -czoE "GetExpectedKernelType[(][^(){}]+[)][^{]+[{][^}]+[}]" || true`
+    INDICATE_VAR_DTYPE_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -co "IndicateVarDataType" || true`
+    if [ ${GET_KERNEL_TYPE_FUNC_CNT} -gt ${INDICATE_VAR_DTYPE_CNT} ]; then
+        echo_line="If you override GetExpectedKernelType method of OperatorWithKernel, please use OperatorWithKernel::IndicateVarDataType() method to get specific input variable's dtype, which checked whether the input variable is initialized (The details in https://github.com/PaddlePaddle/FluidDoc/pull/1527). If you don't use this method to check, you must have one RD (chenwhql (Recommend) , luotao1 or lanxianghit) approval for the usage of other methods.\n"
+        check_approval 1 6836917 47554610 22561442
+    fi
+fi
+
+HAS_OPERATORBASE_FLAG=`git diff -U0 --diff-filter=A upstream/$BRANCH | grep -E "public[[:space:]]+.*OperatorBase" || true`
+if [ "${HAS_OPERATORBASE_FLAG}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="In order to support dynamic graph, all ops are not recommended to inherit OperatorBase. Please use OperatorWithKernel instead.\nYou must have one RD (phlrain (Recommend), luotao1, lanxianghit or XiaoguangHu01) approval for the inherit of OperatorBase.\nYou inherit the OperatorBase class. The corresponding lines are as follows:\n${HAS_OPERATORBASE_FLAG}"
+    check_approval 1 43953930 6836917 47554610 46782768
+fi
+
+HAS_INPLACE_TESTS=`git diff -U0 upstream/$BRANCH |grep "+" |grep -E "inplace_atol[[:space:]]*=.*" || true`
+if [ "${HAS_INPLACE_TESTS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="The calculation results of setting inplace enabled and disabled must be equal, that is, it's not recommended to set inplace_atol.\n If you do need to use inplace_atol, you must have one RD (XiaoguangHu01, lanxianghit, phlrain, luotao1) approval for the usage of inplace_atol.\nThe corresponding lines are as follows:\n${HAS_INPLACE_TESTS}\n"
+    check_approval 1 46782768 47554610 43953930 6836917
+fi
+
+OP_FILE_CHANGED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE ".+_op..*" || true`
+if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for OP_FILE in ${OP_FILE_CHANGED};
+    do
+        CHECK_OBJECT_FLAGS=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${OP_FILE} |grep "+" |grep -E "ShareDataWith[(]|ShareBufferWith[(]" || true`
+        if [ "${CHECK_OBJECT_FLAGS}" != "" ]; then
+            ERROR_LINES="${ERROR_LINES}\n${OP_FILE}${CHECK_OBJECT_FLAGS}\n"
+        fi
+    done
+    if [ "${ERROR_LINES}" != "" ]; then
+        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
+        echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), zhiqiu or luotao1 or lanxianghit) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}"
+        check_approval 1 6836917 6888866 47554610 7913861
+    fi
+fi
+
+NEW_OP_TEST_ADDED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE "test_.*.\.py" || true`
+if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    CHECK_OUTPUT=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep "self\.check_output(a*t*o*l*=*[0-9]"|grep "+" || true`
+    CHECK_OUTPUT_WITH_PLACE=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 "self\.check_output_with_place" |grep ", [atol*,0-9]"|grep "+" || true`
+    CHECK_GRAD=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A5 -E "self\.check_grad|self\.check_grad_with_place"|grep "max_relative_error=" |grep "+" || true`
+    CHECK_GRAD_CHECK=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 -E "checker\.double_grad_check"|grep "eps=|atol=|rtol=" |grep "+" || true`
+    CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK
+    if [ "${CHECK_WHOLE}" != "" ] ; then
+        CHECK_OP=${CHECK_WHOLE//+/'\n+'}       
+        echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n"
+        check_approval 1 6836917 47554610 12538138 43953930
+    fi
+fi
+
+UNITTEST_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "test_.*.\.py" || true`
+if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for TEST_FILE in ${UNITTEST_FILE_CHANGED};
+    do
+        HAS_SKIP_CHECK_GRAD_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "@skip_check_grad_ci" || true`
+        if [ "${HAS_SKIP_CHECK_GRAD_CI}" != "" ]; then
+            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_GRAD_CI}\n"
+        fi
+    done
+    if [ "${ERROR_LINES}" != "" ]; then
+        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
+        echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1 or phlrain) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n"
+        check_approval 1 26615455 6836917 43953930
+    fi
+fi
+
+RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true`
+if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED};
+    do
+        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true`
+	if [[ ${RUNTYPE_ADD} != "" ]];then
+	    RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
+	fi
+    done
+    if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
+        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
+	check_approval 1 32428676 45041955
+    fi
+fi
+
+# Get the list of PR authors with unresolved unit test issues
+pip install PyGithub
+# For getting PR related data
+wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
+HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
+if [ "${HASUTFIXED}" != "" ]; then
+  echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
+  check_approval 1 45041955 22165420
+fi
+
+if [ -n "${echo_list}" ];then
+  echo "****************"
+  echo -e "${echo_list[@]}"
+  echo "There are ${failed_num} approved errors."
+  echo "****************"
+fi
+
+if [ -n "${echo_list}" ]; then
+  exit 6
+fi
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index 049621b9388997deaeab618c09c579858a60d47e..b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt
 
-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32
 
 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index f424d676f70b127d84469bd70d9e7161a93f7bba..9fe58885fa553671cf5c08bd51295f271f4df668 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
 
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python && \
+    pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.8 --no-cache-dir install opencv-python && \
+    pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python
+    pip --no-cache-dir install opencv-python==4.2.0.32
 
 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
@@ -219,4 +219,11 @@ RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0
 RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 CMD source ~/.bashrc
 
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
+
 EXPOSE 22
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 6f201a8579fea29ec6eaabf1faca77da26b11882..9f937cf9343784f10d186dd5bdcbace6f8a4e0e9 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -89,7 +89,7 @@ function do_cpython_build {
     fi
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2
     cd /
     ls ${MY_DIR}
     local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3ab1e68b37557404197cf552cbd0a4def08e9c41
--- /dev/null
+++ b/tools/gen_alias_mapping.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Brief:
+#     This code is used for generating the mapping list of Paddle API alias.
+#     Only the APIs set with the `DEFINE_ALIAS` flag is enable.
+# 
+# Arguments:
+#     None
+# 
+# Usage:
+#     Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh`     
+#
+# Returns:
+#     succ: 0
+# 
+#     Will also print the mapping list to stdout. The format of each line is as below:
+#         <real API implement>\t<API recommend>,<API other alias name1>,<API other alias name2>,...
+
+
+PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.."
+
+find ${PADDLE_ROOT}/python/ -name '*.py' \
+    | xargs  grep -v '^#' \
+    | grep 'DEFINE_ALIAS' \
+    | perl -ne '
+        if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) {
+            my @arr = split(", ", $4); 
+            foreach $i (@arr) {
+                printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2);
+            }
+        }' \
+    | awk -F '[|/]' '
+        {
+            key = "";
+            val = "";
+            if ($2 ~ /.* as .*/) {
+                split($2, arr, " as ");
+                old = arr[1];
+                new = arr[2];
+            } else {
+                old = $2;
+                new = $2;
+            }
+            for (i = 3; i <= (NF - 1 - $NF); ++i) {
+                val = val""$i".";
+            }
+            val =  val""$1"."old
+            for (i = 3; i <= (NF - 1); ++i) {
+                if ($i != "__init__") {
+                    key = key""$i".";
+                }
+            }
+            key = key""new;
+            n2o[key] = val;
+        } 
+        END {
+            for (new in n2o) {
+                old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new];
+                print old, length(new), new;
+            }
+        }' \
+    | sort -k 1,1 -k 2n,2 \
+    | awk '
+        {
+            o2n[$1] = o2n[$1] ? o2n[$1]","$3 : $3;
+        }
+        END { 
+            for (i in o2n) {
+                print i"\t"o2n[i];
+            }
+        }'
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
index ffef02dba4614f7bbbe13ebc30b40438a52b4590..e3a3374b943bc955d54afbef9755ed5147fad7d2 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure]
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
index 837f0e486f6112bfc645c55ded8dfd0726d414d6..c27fdcea2401c26b1ef1dd377c42930b6e74fcf0 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure] 
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 76e1c8baddcd5eeb24f1093d679934d2bbd90730..a18774a8b57b6424e0d89188c537a2086f5aa183 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,6 +27,7 @@ import pydoc
 import hashlib
 import six
 import functools
+import logging
 
 member_dict = collections.OrderedDict()
 
@@ -97,8 +98,11 @@ def queue_dict(member, cur_name):
     member_dict[cur_name] = "({}, ('document', '{}'))".format(args, doc_md5)
 
 
-def visit_member(parent_name, member):
-    cur_name = ".".join([parent_name, member.__name__])
+def visit_member(parent_name, member, member_name=None):
+    if member_name:
+        cur_name = ".".join([parent_name, member_name])
+    else:
+        cur_name = ".".join([parent_name, member.__name__])
     if inspect.isclass(member):
         queue_dict(member, cur_name)
         for name, value in inspect.getmembers(member):
@@ -163,7 +167,13 @@ def visit_all_module(mod):
         if inspect.ismodule(instance):
             visit_all_module(instance)
         else:
-            visit_member(mod.__name__, instance)
+            if member_name != instance.__name__:
+                logging.warn(
+                    "Found alias API, alias name is: {}, original name is: {}".
+                    format(member_name, instance.__name__))
+                visit_member(mod.__name__, instance, member_name)
+            else:
+                visit_member(mod.__name__, instance)
 
 
 modules = sys.argv[1].split(",")
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 102b50c43aeabc6ab2c67840edfaf42615cf51f5..033b4b8723aa30465cdb07198f470d7c09a0f326 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -480,14 +480,8 @@ def get_filenames():
                 filename = ''
                 print("\nWARNING:----Exception in get api filename----\n")
                 print("\n" + api + ' module is ' + module + "\n")
-            if filename != '':
-                # rm contrib file
-                if filename.startswith(
-                        '../python/paddle/fluid/contrib'
-                ) or filename == '../python/paddle/verison.py':
-                    pass
-                elif filename not in filenames:
-                    filenames.append(filename)
+            if filename != '' and filename not in filenames:
+                filenames.append(filename)
             # get all methods
             method = ''
             if inspect.isclass(eval(api)):
@@ -557,14 +551,18 @@ def get_wlist():
 
     '''
     wlist = []
+    wlist_file = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            wlist = wlist + load_dict[key]
-    return wlist
+            if key == 'wlist_file':
+                wlist_file = wlist_file + load_dict[key]
+            else:
+                wlist = wlist + load_dict[key]
+    return wlist, wlist_file
 
 
-wlist = get_wlist()
+wlist, wlist_file = get_wlist()
 
 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
@@ -590,8 +588,14 @@ else:
     if len(filenames) == 0 and len(whl_error) == 0:
         print("-----API_PR.spec is the same as API_DEV.spec-----")
         exit(0)
-    elif '../python/paddle/fluid/core_avx.py' in filenames:
-        filenames.remove('../python/paddle/fluid/core_avx.py')
+    rm_file = []
+    for f in filenames:
+        for w_file in wlist_file:
+            if f.startswith(w_file):
+                rm_file.append(f)
+                filenames.remove(f)
+    if len(rm_file) != 0:
+        print("REMOVE white files: %s" % rm_file)
     print("API_PR is diff from API_DEV: %s" % filenames)
     one_part_filenum = int(math.ceil(len(filenames) / cpus))
     if one_part_filenum == 0:
diff --git a/tools/wlist.json b/tools/wlist.json
index 6989882504eded7c56851e6e9351cef9b4975137..ce6f5fb176b5baa66a480566d8aa884620c5332c 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,4 +1,10 @@
 {
+    "wlist_file" : [
+        "../python/paddle/fluid/contrib", 
+        "../python/paddle/verison.py",
+        "../python/paddle/fluid/core_avx.py",
+        "../python/paddle/distributed"
+    ],
     "wlist_inneed":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",
@@ -63,7 +69,6 @@
         "Compressor",
         "Compressor.config",
         "Compressor.run",
-        "run_check",
         "HDFSClient.upload",
         "HDFSClient.download",
         "HDFSClient.is_exist",
@@ -107,12 +112,27 @@
         "Metric.update",
         "Metric.accumulate",
         "Metric.name",
-        "Metric.add_metric_op",
+        "Metric.compute",
         "Accuracy.reset",
         "Accuracy.update",
         "Accuracy.accumulate",
         "Accuracy.name",
-        "Accuracy.add_metric_op",
+        "Accuracy.compute",
+        "Precision.reset",
+        "Precision.update",
+        "Precision.accumulate",
+        "Precision.name",
+        "Precision.compute",
+        "Recall.reset",
+        "Recall.update",
+        "Recall.accumulate",
+        "Recall.name",
+        "Recall.compute",
+        "Auc.reset",
+        "Auc.update",
+        "Auc.accumulate",
+        "Auc.name",
+        "Auc.compute",
         "Callback.set_params",
         "Callback.on_train_begin",
         "Callback.on_train_end",
@@ -128,7 +148,20 @@
         "Callback.on_eval_batch_end",
         "Callback.on_test_batch_begin",
         "Callback.on_test_batch_end",
-        "Model.prepare"
+        "Model.prepare",
+        "SimpleRNNCell",
+        "SimpleRNNCell.forward",
+        "LSTMCell",
+        "LSTMCell.forward",
+        "GRUCell",
+        "GRUCell.forward",
+        "SimpleRNN",
+        "GRU",
+        "LSTM",
+        "RNN",
+        "BiRNN",
+        "RNNCellBase",
+        "RNNCellBase.get_initial_states"
     ],
     "wlist_no_op_pass":[
         "gelu",